researchv10no/cmd/sort/merge.c - annotate

Return to merge.c CVS log
Up to [Research Unix] / researchv10no / cmd / sort
Annotation of researchv10no/cmd/sort/merge.c, revision 1.1.1.1

1.1       root        1: /* Copyright 1990, AT&T Bell Labs */
                      2: #include <stdlib.h>
                      3: #include <string.h>
                      4: #include "fsort.h"
                      5: 
                      6: char *disorder = "key out of order";
                      7: char *duplicate = "duplicate key";
                      8: char *space = "out of space for merging";
                      9: 
                     10: enum { IEOF, IOK };
                     11: enum { IPRE = 01000, IFOL = 02000 }; /* see find() */
                     12: enum { NMERGE = 16 };
                     13: 
                     14: int nmerge = NMERGE;   /* max number of files to merge at once */
                     15: int nextfile;
                     16: 
                     17: struct merge {         /* current record in an open file */
                     18:        char *name;     /* name of file (for diagnostics) */
                     19:        FILE *file;
                     20:        struct rec *rec; /* pointer to the data (and key) */
                     21:        short serial;   /* file no., breaks ties for stable sort */
                     22:        short del;      /* delete at close */
                     23: };
                     24: 
                     25: struct merge *mfile;   /* one struct per open file */
                     26: struct merge **f;      /* pointers to mfile, ordered by content */
                     27: 
                     28: static void mergephase(int, char*);
                     29: static int find(int, int*);
                     30: static int fillrec(struct merge*);
                     31: static int compare(struct merge*, struct merge*);
                     32: static void syncerr(struct merge*, char*);
                     33: extern int link(char*, char*);
                     34: extern int unlink(char*);
                     35: static void recinit(int nf, int nm, int n);
                     36: static void mv(int, int);
                     37: 
                     38: static void
                     39: recalloc(struct merge *m)
                     40: {
                     41:        if(m->rec)
                     42:                return;         /* hygiene; doesn't happen */
                     43:        m->rec = (struct rec*)malloc(MINREC);
                     44:        if(m->rec == 0)
                     45:                fatal(space, "", 0);
                     46:        m->rec->dlen = 0;
                     47:        m->rec->next = (struct rec*)((uchar*)m->rec + MINREC);
                     48: }
                     49: 
                     50: /* misfortune : all fields are parsed and converted 
                     51:    before comparison.  Lazy parsing might be in order,
                     52:    but it's too hard to contemplate */
                     53: 
                     54: /* Merge strategy, to merge N inputs in groups of at most M.
                     55:    Numbers, like (1), are keyed to comments in code.
                     56:    If N <= M, merge everything. (4)
                     57:    If N > M*(M-1)+1
                     58:    greedily merge M at a time (1) to reduce to the next case.
                     59:    Merge x bunches of M (3) and one bunch of y (2),
                     60:    then merge the x+1 new files (4) with the remaining z files
                     61:    in an exactly M-way merge. Calculate x,y,z thus:
                     62:        Mx + y + z = N  file count
                     63:        x + 1 + z = M   final merge
                     64:         2 <= y <= M
                     65:        x >= 0
                     66:        z >= 0
                     67:    by algebra
                     68:        (M-1)x + y = N-M+1
                     69:         y == (N-M+1) mod (M-1),   2 <= y <= M
                     70:         x == (n-M+1-y)/(M-1)
                     71:    Total cost is Mx + y + N.  (Strategy due to P. McIlroy.) */
                     72: 
                     73: /* merge n files.  the first unmerged temp file is nm (i.e.
                     74:    nm is the number of already merged files).  the first
                     75:    unmerged input file is nf.
                     76:    rename temp files to keep a dense set beginning at 0
                     77:    (It seems silly to rename 100 files after merging
                     78:    every 1; how much time does that actually cost?) */
                     79: void
                     80: merge1(int nf, int nm, int n)
                     81: {
                     82:        char *name;
                     83:        int i, j;
                     84:        int flag = nf<nfiles;
                     85:        recinit(nf, nm, n);
                     86:        name = filename(nextfile++);
                     87:        mergephase(n, name);
                     88:        free(name);
                     89:        if(flag)
                     90:                return;
                     91:        mv(--nextfile, nm);
                     92:        for(i=nm+n, j=nm+1; i<nextfile; i++, j++)
                     93:                mv(i, j);
                     94:        nextfile -= n - 1;
                     95: }
                     96: 
                     97: /* if flag = 0 merge input files; else temps only.
                     98:    this program is specialized to two levels of merge;
                     99:    should fix that some day. */
                    100: 
                    101: void
                    102: merge(int flag)
                    103: {
                    104:        char buf[BUFSIZ];
                    105:        FILE *input, *output;
                    106:        int n, y;
                    107:        char *name;
                    108:        int nf = 0;             /* next input file to merge */
                    109:        int nm = 0;             /* total already merged once */
                    110:        int nt = nfiles;        /* files as yet unmerged */
                    111:        if(flag) {
                    112:                nf = nfiles;
                    113:                nt = nextfile;
                    114:        }
                    115:        if(mfile == 0)
                    116:                mfile = (struct merge*)calloc(nmerge+1,
                    117:                                              sizeof(*mfile));
                    118:        if(f == 0)
                    119:                f = (struct merge**)calloc(nmerge+1,
                    120:                                               sizeof(*f));
                    121:        if(mfile==0 || f==0)
                    122:                fatal(space,"",0);
                    123: 
                    124:        if(nt > nmerge*(nmerge-1)+1) {          /* (1) */
                    125:                for(nm=0; nt>0; ) {
                    126:                        n = nt>nmerge? nmerge: nt;
                    127:                        merge1(nf, nm, n);
                    128:                        if(flag) nf += n;
                    129:                        nt -= n;
                    130:                        nm++;
                    131:                }
                    132:                merge(1);
                    133:                return;
                    134:        }
                    135:        if(nt > nmerge) {                       /* (2) */
                    136:                y = (nt-nmerge+1)%(nmerge-1);
                    137:                if(y <= 1) y += nmerge-1;
                    138:                merge1(nf, nm, y);
                    139:                nf = nf+y>=nfiles? nfiles: nf+y;
                    140:                nt -= y;
                    141:                nm++;
                    142:        }
                    143:        while(nt+nm > nmerge) {                 /* (3) */
                    144:                merge1(nf, nm, nmerge);
                    145:                nf = nf+nmerge>=nfiles? nfiles: nf+nmerge;
                    146:                nt -= nmerge;
                    147:                nm++;
                    148:        }
                    149: 
                    150:        n = nm+nt;                              /* (4) */
                    151:        recinit(nf, 0, n);
                    152:        if(!overwrite(nf))
                    153:                name = oname;
                    154:        else
                    155:                name = filename(nextfile++);
                    156:        mergephase(n, name);
                    157:        if(name == oname)
                    158:                return;
                    159:        input = fileopen(name, "r");
                    160:        output = fileopen(oname, "w");
                    161:        while(n = fread(buf, 1, sizeof(buf), input))
                    162:                if(fwrite(buf, 1, n, output) != n)
                    163:                        fatal("error writing", oname, 0);
                    164:        fileclose(input, name);
                    165:        unlink(name);
                    166:        free(name);
                    167:        fileclose(output, oname);
                    168: }
                    169: 
                    170: /* Initialize merge records for n files, beginning with
                    171:    input file number nf and temp file number nm.
                    172: 
                    173:    There are nfiles-nf input files; any files beyond that
                    174:    number must be temporaries, which come from already
                    175:    merged inputs.  (The only time that both kinds of
                    176:    files are present is the very last merge, when for
                    177:    stability, the temps are regarded as earlier.) */
                    178: 
                    179: static void
                    180: recinit(int nf, int nm, int n)
                    181: {
                    182:        int i;
                    183:        struct merge *m;
                    184:        int last = nfiles-nf + nextfile-nm == n;
                    185:        for(i=0; i<=n; i++)     /* one extra, for mergephase() */
                    186:                recalloc(&mfile[i]);
                    187:        for(i=0; i<n; i++) {
                    188:                m = &mfile[i];
                    189:                m->del = nf>=nfiles || last && i<nextfile;
                    190:                m->name = m->del? filename(nm++): files[nf++];
                    191:                m->file = fileopen(m->name, "r");
                    192:                m->serial = i;
                    193:                f[i] = m;
                    194:        }
                    195:        f[n] = &mfile[n];
                    196: }
                    197: 
                    198: static void
                    199: mv(int i, int j)
                    200: {
                    201:        char *old, *new;
                    202:        if(i == j) return;      /* believed not to happen */
                    203:        old = filename(i);
                    204:        new = filename(j);
                    205:        unlink(new);
                    206:        if(link(old,new) == -1 || unlink(old) == -1)
                    207:                fatal("cannot move", old, 0);
                    208:        free(old);
                    209:        free(new);
                    210: }
                    211: 
                    212: static void
                    213: emit(FILE *output)
                    214: {
                    215:        struct merge *m = f[0];
                    216:        int n = m->rec->dlen;
                    217:        uchar *p = data(m->rec);        /* hanky panky for speed */
                    218:        uchar *e = p + n++;             /* use fwrite instead of */
                    219:        int c = *e;                     /* printf */
                    220:        *e = '\n';
                    221:        if(fwrite((char*)p, 1, n, output) != n)
                    222:                fatal("error writing", m->name, 0);
                    223:        *e = c;
                    224: }
                    225: 
                    226: /* Merge files in mfile[0]..mfile[n-1]. 
                    227:    f[0]..f[l-1] points to files in increasing order
                    228:    of their current records.  All the current records
                    229:    are strictly ordered, either by tie-breaking, or
                    230:    by skipping records that compare equal under -u.
                    231:    There are two phases: (a) initialization (reading
                    232:    from a file when there is no record at hand) and
                    233:    (b) continuation (reading the next record with the
                    234:    intent of displacing the current record).
                    235:    These conditions arise, at places numbered in comments
                    236:    (1) the record compares equal to another, and would follow
                    237:    that other if ties were broken
                    238:    (2) the record compares equal to another, and would precede
                    239:    that other if ties were broken
                    240:    (3) the record falls between two others
                    241: 
                    242:    At all times the keys pointed to by f[0]..f[l]
                    243:    are distinct (perhaps due to tie-breaking), as
                    244:    are the files they come from.  When a new key
                    245:    is read, its proper home (in tie-broken order) 
                    246:    is between f[j-1] and f[j].
                    247:    The picture below shows the hardest case (2b),
                    248:    where key a from file F is about to be emitted.
                    249:    All keys in the sequence axby are distinct, as
                    250:    are the files they come from.  A new key is read
                    251:    from file F into the extra space pointed to by
                    252:    f[i].  (Keeping one extra key allows disorder to
                    253:    be detected almost for free.)  The
                    254:    new key is equal to key b pointed to by f[j]
                    255:    and file F precedes file G in stability order.
                    256:    Thus b(F) deserves to be kept and b(G) discarded.
                    257:     0                 j                 l=n
                    258:    --------------------------------------------
                    259:    | a(F) |     x    | b(G) |     y    | b(F) |
                    260:    --------------------------------------------
                    261:   
                    262:    We emit the record at 0, and displace the key
                    263:    at j, from whose file there is now no record
                    264:    present; l decreases by one.  
                    265:     0                 j          l      n
                    266:    --------------------------------------------
                    267:    |     x    | b(F) |     y    | ?(G) |      |
                    268:    --------------------------------------------
                    269: 
                    270:    This takes us back to initialization.  From
                    271:    this configuration it is possible to reach
                    272:    case (2a), like (2b) except that the key at 0
                    273:    is not to be emitted; (2a) cannot happen during
                    274:    the original initialization.
                    275: */
                    276: 
                    277: #define moveup(a,n) memmove(a+1, a, (n)*sizeof(*a))
                    278: #define movedown(a,n) memmove(a-1, a, (n)*sizeof(*a))
                    279: 
                    280: static void
                    281: mergephase(int n, char *name)
                    282: {
                    283:        int j, flags;
                    284:        struct merge *mp;
                    285:        struct rec *r;
                    286:        int l = 0;
                    287:        int k = -1;             /* for spotting sync errors */
                    288:        FILE *output = fileopen(name, "w");
                    289: init:  while(l < n) {                                  /*(a)*/
                    290:                mp = f[l];
                    291:                if(fillrec(mp) == IEOF) {
                    292:                        movedown(f+l+1, n-l);
                    293:                        f[n--] = mp;
                    294:                        continue;
                    295:                }
                    296:                j = find(l, &flags);
                    297:                if(j <= k)
                    298:                        syncerr(mp, disorder);
                    299:                if(flags & IFOL) {                      /*(1a)*/
                    300:                        if(!aflag || doaccum(f[j-1]->rec, mp->rec))
                    301:                                continue;
                    302:                } else if(flags & IPRE)                 /*(2a)*/
                    303:                        if(!aflag || doaccum(mp->rec, f[j]->rec)) {
                    304:                                f[l] = f[j];
                    305:                                f[j] = mp;
                    306:                                k = j;
                    307:                                continue;
                    308:                        }
                    309:                moveup(f+j, l-j);                       /*(3a)*/
                    310:                f[j] = mp;
                    311:                l++;
                    312:        }
                    313:        while(n > 0) {                                  /*(b)*/
                    314:                mp = f[l];
                    315:                r = mp->rec;
                    316:                *mp = *f[0];
                    317:                mp->rec = r;
                    318:                if(fillrec(mp) == IEOF) {
                    319:                        emit(output);
                    320:                        mp = f[0];
                    321:                        movedown(f+1, l);
                    322:                        f[--n] = mp;
                    323:                        l = n;
                    324:                        continue;
                    325:                }
                    326:                j = find(l, &flags);
                    327:                if(j == 0)
                    328:                        syncerr(mp, disorder);
                    329:                if(flags & IFOL) {                      /*(1b)*/
                    330:                        if(!aflag || doaccum(f[j-1]->rec, mp->rec))
                    331:                                continue;
                    332:                } else if(flags & IPRE)                 /*(2b)*/
                    333:                        if(!aflag || doaccum(mp->rec, f[j]->rec)) {
                    334:                                emit(output);
                    335:                                f[l] = f[j];
                    336:                                f[j] = mp;
                    337:                                mp = f[0];
                    338:                                movedown(f+1, l);
                    339:                                f[l--] = mp;
                    340:                                k = j - 1;
                    341:                                goto init;
                    342:                        }
                    343:                emit(output);                           /*(3b)*/
                    344:                mp = f[0];
                    345:                movedown(f+1, j-1);
                    346:                f[j-1] = f[l];
                    347:                f[l] = mp;
                    348:        }
                    349:                
                    350:        fileclose(output, name);
                    351: }
                    352: 
                    353: static int
                    354: fillrec(struct merge *mp)
                    355: {
                    356:        struct rec *r = getline(mp->rec, mp->file);
                    357:        if(r == 0)
                    358:                return IOK;
                    359:        if(r == ENDFILE) {
                    360:                fileclose(mp->file, mp->name);
                    361:                if(mp->del)
                    362:                        unlink(mp->name);
                    363:                return IEOF;
                    364:        }
                    365:        mp->rec = r;
                    366:        return IOK;
                    367: }
                    368: 
                    369: /* 
                    370:    Return the proper index for the extra record f[l]
                    371:    to be inserted before (regardless of -u).
                    372:    Under -u, if the extra record is a duplicate, return
                    373:    the index flagged by IFOL or IPRE according as
                    374:    the insertion position follows or precedes a
                    375:    record of the same value.
                    376: */
                    377: 
                    378: static int
                    379: find(int l, int *flags)
                    380: {
                    381:        int i, t;
                    382:        int bot = 0;
                    383:        int top = l;
                    384:        while(bot < top) {
                    385:                i = (bot+top)/2;
                    386:                t = compare(f[l], f[i]);
                    387:                if(t < 0)
                    388:                        top = i;
                    389:                else if(t > 0)
                    390:                        bot = i + 1;
                    391:                else if(uflag) {
                    392:                        if(f[l]->serial < f[i]->serial) {
                    393:                                *flags = IPRE;
                    394:                                return i;
                    395:                        } else {
                    396:                                *flags = IFOL;
                    397:                                return i + 1;
                    398:                        }
                    399:                }
                    400:                else if(f[l]->serial < f[i]->serial)
                    401:                        top = i;
                    402:                else 
                    403:                        bot = i + 1;
                    404:        }
                    405:        *flags = 0;
                    406:        return top;
                    407: }      
                    408: 
                    409: static int             
                    410: compare(struct merge *mi, struct merge *mj)
                    411: {
                    412:        uchar *ip, *jp;
                    413:        uchar *ei, *ej;
                    414:        uchar *trans, *keep;
                    415:        int li, lj, k;
                    416:        if(simplekeyed) {
                    417:                trans = fields->trans;
                    418:                keep = fields->keep;
                    419:                ip = data(mi->rec);
                    420:                jp = data(mj->rec);
                    421:                ei = ip + mi->rec->dlen;
                    422:                ej = jp + mj->rec->dlen;
                    423:                for( ; ; ip++, jp++) {
                    424:                        while(ip<ei && !keep[*ip]) ip++;
                    425:                        while(jp<ej && !keep[*jp]) jp++;
                    426:                        if(ip>=ei || jp>=ej) break;
                    427:                        k = trans[*ip] - trans[*jp];
                    428:                        if(k != 0) break;
                    429:                }
                    430:                if(ip<ei && jp<ej)
                    431:                        return k*signedrflag;
                    432:                else if(ip < ei)
                    433:                        return signedrflag;
                    434:                else if(jp < ej)
                    435:                        return -signedrflag;
                    436:                else if(sflag)
                    437:                        return 0;
                    438:        } else if(keyed) {
                    439:                ip = key(mi->rec);
                    440:                jp = key(mj->rec);
                    441:                li = mi->rec->klen;
                    442:                lj = mj->rec->klen;
                    443:                for(k=li<lj?li:lj; --k>=0; ip++, jp++)
                    444:                        if(*ip != *jp)
                    445:                                break;
                    446:                if(k < 0) {
                    447:                        if(li != lj)
                    448:                                fatal("theorem disproved","",0);
                    449:                        if(sflag)
                    450:                                return 0;
                    451:                } else
                    452:                        return *ip - *jp;
                    453:                
                    454:        }
                    455:        li = mi->rec->dlen;
                    456:        lj = mj->rec->dlen;
                    457:        ip = data(mi->rec);
                    458:        jp = data(mj->rec);
                    459:        for(k=li<lj?li:lj; --k>=0; ip++, jp++)
                    460:                if(*ip != *jp)
                    461:                        break;
                    462:        return (k<0? li-lj: *ip-*jp)*signedrflag;
                    463: }
                    464: 
                    465: void
                    466: check(char *name)
                    467: {
                    468:        int i, t;
                    469: 
                    470:        mfile = (struct merge*)calloc(2,sizeof(*mfile));
                    471:        recalloc(&mfile[0]);
                    472:        recalloc(&mfile[1]);
                    473:        mfile[0].name = mfile[1].name = name;
                    474:        mfile[0].file = mfile[1].file = fileopen(name, "r");
                    475:        if(mfile[0].file == 0)
                    476:                fatal("can't open ", name, 0);
                    477: 
                    478:        if(fillrec(&mfile[0]) == IEOF)
                    479:                return;
                    480:        for(i=1; fillrec(&mfile[i])!=IEOF; i^=1) {
                    481:                t = compare(&mfile[i^1], &mfile[i]);
                    482:                if(t>0)
                    483:                        syncerr(&mfile[i], disorder);
                    484:                else if(t==0 && uflag)
                    485:                        syncerr(&mfile[i], duplicate);
                    486:        }
                    487: }
                    488: 
                    489: static void
                    490: syncerr(struct merge *m, char *message)
                    491: {
                    492:        int n = m->rec->dlen;
                    493:        warn("trouble in file", m->name, 0);
                    494:        fatal(message, n?(char*)data(m->rec):"", n);
                    495: }
unix.superglobalmegacorp.com
This archive runs on limited infrastructure. Preserving old code on modern bandwidth. Automated agents are requested to crawl responsibly.