[prev in list] [next in list] [prev in thread] [next in thread]
List: openbsd-tech
Subject: Re: sort(1) updates
From: Otto Moerbeek <otto () drijf ! net>
Date: 2014-06-30 9:53:04
Message-ID: 20140630095304.GA6395 () discoboy ! drijf ! net
[Download RAW message or body]
On Sun, Jun 29, 2014 at 06:48:32PM -0400, Jared Yanovich wrote:
> Hi,
>
> sort(1) does some funky things and isn't hard to break:
>
> $ perl -e 'print "\n"x117000,"x\n"' | sort | sort -c
>
> This patch contains a few changes from NetBSD to correct the behavior regarding
> ordering of appending bins to output in certain circumstances which helps pass
> more of our own regress tests and improves performance (e.g. regress suite
> runtime is <40% with new code compared to old/current code on my box). The new
> code is also much easier to understand..
>
> NetBSD logs:
>
> msort.c -r 1.9
> merge(): use array of buffers instead of one big buffer for all records, and
> enlarge them as necessary to read records from merged files; the buffers
> are allocated once per program run, so there shouldn't be any
> performance difference
> This makes sort(1) pass also regression 40B and should make it
> fully arbitrary long record capable.
> XXX the buffer array could probably be freed on end of fmerge() to save memory
>
> fsort.c -r 1.37
> The code that attempted to sort large files by sorting each chunk by the
> first key byte and writing to a temp file, then sorting the records from
> each temp file that had the same first key byte (and repeating for upto
> 4 key bytes) was a nice idea, but completely doomed to failure.
> Eg PR/9308 where a 70MB file has all but one record the same and short keys.
> Not only does the code not work, it is rather guaranteed to be slow.
> Instead always use a merge sort for fully sorted chunk of records (each
> temporary file contains one lot of sorted records).
> The -H option already did this, so just rip out all the code and variables
> that can't be used when -H was specified.
Hi Jared, good to see you active again ;-)
This indeed solves some problems, but I have a test file on which it cores.
The testfile is available at http://www.drijf.net/openbsd/test4.gz
-Otto
>
> Index: append.c
> ===================================================================
> RCS file: /cvs/src/usr.bin/sort/append.c,v
> retrieving revision 1.10
> diff -u -p -r1.10 append.c
> --- append.c 27 Oct 2009 23:59:43 -0000 1.10
> +++ append.c 29 Jun 2014 22:17:16 -0000
> @@ -148,37 +148,3 @@ append(u_char **keylist, int nelem, int
> put(crec, fp);
> }
> }
> -
> -/*
> - * output the already sorted eol bin.
> - */
> -void
> -rd_append(int binno, union f_handle infl0, int nfiles, FILE *outfp,
> - u_char *buffer, u_char *bufend)
> -{
> - RECHEADER *rec;
> -
> - rec = (RECHEADER *) buffer;
> - if (!getnext(binno, infl0, nfiles, (RECHEADER *) buffer, bufend, 0)) {
> - putline(rec, outfp);
> - while (getnext(binno, infl0, nfiles, (RECHEADER *) buffer,
> - bufend, 0) == 0) {
> - if (!UNIQUE)
> - putline(rec, outfp);
> - }
> - }
> -}
> -
> -/*
> - * append plain text--used after sorting the biggest bin.
> - */
> -void
> -concat(FILE *a, FILE *b)
> -{
> - int nread;
> - char buffer[4096];
> -
> - rewind(b);
> - while ((nread = fread(buffer, 1, 4096, b)) > 0)
> - EWRITE(buffer, 1, nread, a);
> -}
> Index: extern.h
> ===================================================================
> RCS file: /cvs/src/usr.bin/sort/extern.h,v
> retrieving revision 1.8
> diff -u -p -r1.8 extern.h
> --- extern.h 22 Dec 2009 19:47:02 -0000 1.8
> +++ extern.h 29 Jun 2014 22:17:16 -0000
> @@ -36,7 +36,6 @@
>
> void append(u_char **, int, int, FILE *, void (*)(RECHEADER *, FILE *),
> struct field *);
> -void concat(FILE *, FILE *);
> length_t enterkey(RECHEADER *, DBT *, int, struct field *);
> void fixit(int *, char **);
> void fldreset(struct field *);
> @@ -44,11 +43,9 @@ FILE *ftmp(void);
> void fmerge(int, union f_handle, int,
> int (*)(int, union f_handle, int, RECHEADER *, u_char *, struct field *),
> FILE *, void (*)(RECHEADER *, FILE *), struct field *);
> -void fsort(int, int, union f_handle, int, FILE *, struct field *);
> +void fsort(union f_handle, int, FILE *, struct field *);
> int geteasy(int, union f_handle,
> int, RECHEADER *, u_char *, struct field *);
> -int getnext(int, union f_handle,
> - int, RECHEADER *, u_char *, struct field *);
> int makekey(int, union f_handle,
> int, RECHEADER *, u_char *, struct field *);
> int makeline(int, union f_handle,
> @@ -57,7 +54,6 @@ void merge(int, int,
> int (*)(int, union f_handle, int, RECHEADER *, u_char *, struct field *),
> FILE *, void (*)(RECHEADER *, FILE *), struct field *);
> void num_init(void);
> -void onepass(u_char **, int, long, long *, u_char *, FILE *);
> int optval(int, int);
> void order(union f_handle,
> int (*)(int, union f_handle, int, RECHEADER *, u_char *, struct field *),
> @@ -65,6 +61,5 @@ void order(union f_handle,
> int c_warn);
> void putline(RECHEADER *, FILE *);
> void putrec(RECHEADER *, FILE *);
> -void rd_append(int, union f_handle, int, FILE *, u_char *, u_char *);
> int setfield(char *, struct field *, int);
> void settables(int);
> Index: files.c
> ===================================================================
> RCS file: /cvs/src/usr.bin/sort/files.c,v
> retrieving revision 1.13
> diff -u -p -r1.13 files.c
> --- files.c 27 Oct 2009 23:59:43 -0000 1.13
> +++ files.c 29 Jun 2014 22:17:16 -0000
> @@ -40,73 +40,6 @@
> static int seq(FILE *, DBT *, DBT *);
>
> /*
> - * this is the subroutine for file management for fsort().
> - * It keeps the buffers for all temporary files.
> - */
> -int
> -getnext(int binno, union f_handle infl0, int nfiles, RECHEADER *pos, u_char *end,
> - struct field *dummy)
> -{
> - int i;
> - u_char *hp;
> - static size_t nleft = 0;
> - static int cnt = 0, flag = -1;
> - static u_char maxb = 0;
> - static FILE *fp;
> -
> - if (nleft == 0) {
> - if (binno < 0) /* reset files. */ {
> - for (i = 0; i < nfiles; i++) {
> - rewind(fstack[infl0.top + i].fp);
> - fstack[infl0.top + i].max_o = 0;
> - }
> - flag = -1;
> - nleft = cnt = 0;
> - return (-1);
> - }
> - maxb = fstack[infl0.top].maxb;
> - for (; nleft == 0; cnt++) {
> - if (cnt >= nfiles) {
> - cnt = 0;
> - return (EOF);
> - }
> - fp = fstack[infl0.top + cnt].fp;
> - fread(&nleft, sizeof(nleft), 1, fp);
> - if (binno < maxb)
> - fstack[infl0.top+cnt].max_o
> - += sizeof(nleft) + nleft;
> - else if (binno == maxb) {
> - if (binno != fstack[infl0.top].lastb) {
> - fseek(fp, fstack[infl0.top+
> - cnt].max_o, SEEK_SET);
> - fread(&nleft, sizeof(nleft), 1, fp);
> - }
> - if (nleft == 0)
> - fclose(fp);
> - } else if (binno == maxb + 1) { /* skip a bin */
> - fseek(fp, nleft, SEEK_CUR);
> - fread(&nleft, sizeof(nleft), 1, fp);
> - flag = cnt;
> - }
> - }
> - }
> - if ((u_char *) pos > end - sizeof(TRECHEADER))
> - return (BUFFEND);
> - fread(pos, sizeof(TRECHEADER), 1, fp);
> - if (end - pos->data < pos->length) {
> - hp = ((u_char *)pos) + sizeof(TRECHEADER);
> - for (i = sizeof(TRECHEADER); i ; i--)
> - ungetc(*--hp, fp);
> - return (BUFFEND);
> - }
> - fread(pos->data, pos->length, 1, fp);
> - nleft -= pos->length + sizeof(TRECHEADER);
> - if (nleft == 0 && binno == fstack[infl0.top].maxb)
> - fclose(fp);
> - return (0);
> -}
> -
> -/*
> * this is called when there is no special key. It's only called
> * in the first fsort pass.
> */
> Index: fsort.c
> ===================================================================
> RCS file: /cvs/src/usr.bin/sort/fsort.c,v
> retrieving revision 1.21
> diff -u -p -r1.21 fsort.c
> --- fsort.c 27 Oct 2009 23:59:43 -0000 1.21
> +++ fsort.c 29 Jun 2014 22:17:16 -0000
> @@ -33,11 +33,11 @@
> */
>
> /*
> - * Read in the next bin. If it fits in one segment sort it;
> - * otherwise refine it by segment deeper by one character,
> - * and try again on smaller bins. Sort the final bin at this level
> - * of recursion to keep the head of fstack at 0.
> - * After PANIC passes, abort to merge sort.
> + * Read in a block of records (until 'enough').
> + * sort, write to temp file.
> + * Merge sort temp files into output file
> + * Small files miss out the temp file stage.
> + * Large files might get multiple merges.
> */
> #include "sort.h"
> #include "fsort.h"
> @@ -48,28 +48,26 @@
> u_char *linebuf;
> size_t linebuf_size = MAXLLEN;
> struct tempfile fstack[MAXFCT];
> -#define FSORTMAX 4
> -int PANIC = FSORTMAX;
> +
> +#define CHECKFSTACK(n) \
> + if (n >= MAXFCT) \
> + errx(2, "fstack: too many temporary files; sort in pieces")
>
> void
> -fsort(int binno, int depth, union f_handle infiles, int nfiles, FILE *outfp,
> +fsort(union f_handle infiles, int nfiles, FILE *outfp,
> struct field *ftbl)
> {
> - u_char *weights, **keypos, *bufend, *tmpbuf;
> + u_char *weights, **keypos, *bufend;
> static u_char *buffer, **keylist;
> static size_t bufsize;
> - int ntfiles, mfct = 0, total, i, maxb, lastb, panic = 0;
> + int ntfiles, mfct = 0;
> int c, nelem;
> - long sizes[NBINS+1];
> - union f_handle tfiles, mstart = {MAXFCT-16};
> + union f_handle tfiles, mstart = { MAXFCT - MERGE_FNUM };
> int (*get)(int, union f_handle, int, RECHEADER *,
> u_char *, struct field *);
> RECHEADER *crec;
> struct field tfield[2];
> - FILE *prevfp, *tailfp[FSORTMAX+1];
>
> - memset(tailfp, 0, sizeof(tailfp));
> - prevfp = outfp;
> memset(tfield, 0, sizeof(tfield));
> if (ftbl[0].flags & R)
> tfield[0].weights = Rascii;
> @@ -84,205 +82,114 @@ fsort(int binno, int depth, union f_hand
> err(2, NULL);
> }
> bufend = buffer + bufsize - 1;
> - if (binno >= 0) {
> - tfiles.top = infiles.top + nfiles;
> - get = getnext;
> - } else {
> - tfiles.top = 0;
> - if (SINGL_FLD)
> - get = makeline;
> - else
> - get = makekey;
> - }
> - for (;;) {
> - memset(sizes, 0, sizeof(sizes));
> - c = ntfiles = 0;
> - if (binno == weights[REC_D] &&
> - !(SINGL_FLD && ftbl[0].flags & F)) { /* pop */
> - rd_append(weights[REC_D],
> - infiles, nfiles, prevfp, buffer, bufend);
> - break;
> - } else if (binno == weights[REC_D]) {
> - depth = 0; /* start over on flat weights */
> - ftbl = tfield;
> - weights = ftbl[0].weights;
> +
> + if (SINGL_FLD)
> + get = makeline;
> + else
> + get = makekey;
> +
> + c = nelem = ntfiles = 0;
> + keypos = keylist;
> + crec = (RECHEADER *) buffer;
> + while (c != EOF) {
> + keypos = keylist;
> + nelem = 0;
> + crec = (RECHEADER *) buffer;
> +
> + do_read:
> + while ((c = get(-1, infiles, nfiles, crec, bufend,
> + ftbl)) == 0) {
> + *keypos++ = crec->data;
> + if (++nelem == MAXNUM) {
> + c = BUFFEND;
> + break;
> + }
> + crec = (RECHEADER *)((char *) crec +
> + SALIGN(crec->length) + sizeof(TRECHEADER));
> }
> - while (c != EOF) {
> - keypos = keylist;
> - nelem = 0;
> - crec = (RECHEADER *) buffer;
> - while ((c = get(binno, infiles, nfiles, crec, bufend,
> - ftbl)) == 0) {
> - *keypos++ = crec->data + depth;
> - if (++nelem == MAXNUM) {
> - c = BUFFEND;
> - break;
> - }
> - crec = (RECHEADER *)((char *)crec +
> - SALIGN(crec->length) + sizeof(TRECHEADER));
> + if (c == BUFFEND && nelem < MAXNUM &&
> + bufsize < MAXBUFSIZE) {
> + u_char **keyp;
> + u_char *oldb = buffer;
> + u_char *nbuffer;
> +
> + /* buffer was too small for data, allocate
> + * bigger buffer */
> + nbuffer = realloc(buffer, bufsize * 2);
> + if (!nbuffer) {
> + err(2, "failed to realloc buffer to %lu bytes",
> + (unsigned long) bufsize * 2);
> }
> + buffer = nbuffer;
> + bufsize *= 2;
> + bufend = buffer + bufsize;
> +
> + /* patch up keylist[] */
> + for (keyp = &keypos[-1]; keyp >= keylist; keyp--)
> + *keyp = buffer + (*keyp - oldb);
> +
> + crec = (RECHEADER *) (buffer +
> + ((u_char *)crec - oldb));
> + goto do_read;
> + }
> + if (c != BUFFEND && !ntfiles && !mfct) {
> + /* do not push */
> + continue;
> + }
> +
> + /* push */
> + fstack[mstart.top + mfct].fp = ftmp();
> + if (radix_sort((const u_char **)keylist, nelem, weights,
> + REC_D))
> + err(2, NULL);
> + append(keylist, nelem, 0, fstack[mstart.top + mfct].fp,
> + putrec, ftbl);
> + mfct++;
> +
> + /* reduce number of open files */
> + if (mfct == MERGE_FNUM || (c == EOF && ntfiles)) {
> /*
> - * buffer was too small for data, allocate
> - * a bigger buffer.
> + * Only copy extra incomplete crec
> + * data if there are any.
> */
> - if (c == BUFFEND && nelem == 0) {
> - bufsize *= 2;
> - tmpbuf = realloc(buffer, bufsize);
> - if (!tmpbuf)
> - err(2, "failed to realloc buffer");
> - crec = (RECHEADER *)
> - (tmpbuf + ((u_char *)crec - buffer));
> - buffer = tmpbuf;
> - bufend = buffer + bufsize - 1;
> - continue;
> + int nodata = (bufend >= (u_char *)crec &&
> + bufend <= crec->data);
> + size_t sz = 0;
> + u_char *tmpbuf = NULL;
> +
> + if (!nodata) {
> + sz = bufend - crec->data;
> + tmpbuf = malloc(sz);
> + memmove(tmpbuf, crec->data, sz);
> }
> - if (c == BUFFEND || ntfiles || mfct) { /* push */
> - if (panic >= PANIC) {
> - fstack[MAXFCT-16+mfct].fp = ftmp();
> - if (radixsort((const u_char **)keylist,
> - nelem, weights, REC_D))
> - err(2, NULL);
> - append(keylist, nelem, depth, fstack[
> - MAXFCT-16+mfct].fp, putrec, ftbl);
> - mfct++;
> - /* reduce number of open files */
> - if (mfct == 16 ||(c == EOF && ntfiles)) {
> - fstack[tfiles.top + ntfiles].fp
> - = ftmp();
> - fmerge(0, mstart, mfct, geteasy,
> - fstack[tfiles.top+ntfiles].fp,
> - putrec, ftbl);
> - ntfiles++;
> - mfct = 0;
> - }
> - } else {
> - fstack[tfiles.top + ntfiles].fp= ftmp();
> - onepass(keylist, depth, nelem, sizes,
> - weights, fstack[tfiles.top+ntfiles].fp);
> - ntfiles++;
> - }
> + CHECKFSTACK(ntfiles);
> + fstack[ntfiles].fp = ftmp();
> + fmerge(0, mstart, mfct, geteasy,
> + fstack[ntfiles].fp, putrec, ftbl);
> + ntfiles++;
> + mfct = 0;
> +
> + if (!nodata) {
> + memmove(crec->data, tmpbuf, sz);
> + free(tmpbuf);
> }
> }
> - get = getnext;
> - if (!ntfiles && !mfct) { /* everything in memory--pop */
> - if (nelem > 1) {
> - if (STABLE) {
> - i = sradixsort((const u_char **)keylist,
> - nelem, weights, REC_D);
> - } else {
> - i = radixsort((const u_char **)keylist,
> - nelem, weights, REC_D);
> - }
> - if (i)
> - err(2, NULL);
> - }
> - append(keylist, nelem, depth, outfp, putline, ftbl);
> - break; /* pop */
> - }
> - if (panic >= PANIC) {
> - if (!ntfiles)
> - fmerge(0, mstart, mfct, geteasy,
> - outfp, putline, ftbl);
> - else
> - fmerge(0, tfiles, ntfiles, geteasy,
> - outfp, putline, ftbl);
> - break;
> - }
> - total = maxb = lastb = 0; /* find if one bin dominates */
> - for (i = 0; i < NBINS; i++)
> - if (sizes[i]) {
> - if (sizes[i] > sizes[maxb])
> - maxb = i;
> - lastb = i;
> - total += sizes[i];
> - }
> - if (sizes[maxb] < max((total / 2) , BUFSIZE))
> - maxb = lastb; /* otherwise pop after last bin */
> - fstack[tfiles.top].lastb = lastb;
> - fstack[tfiles.top].maxb = maxb;
> -
> - /* start refining next level. */
> - get(-1, tfiles, ntfiles, crec, bufend, 0); /* rewind */
> - for (i = 0; i < maxb; i++) {
> - if (!sizes[i]) /* bin empty; step ahead file offset */
> - get(i, tfiles, ntfiles, crec, bufend, 0);
> - else
> - fsort(i, depth+1, tfiles, ntfiles, outfp, ftbl);
> - }
> - if (lastb != maxb) {
> - if (prevfp != outfp)
> - tailfp[panic] = prevfp;
> - prevfp = ftmp();
> - for (i = maxb+1; i <= lastb; i++)
> - if (!sizes[i])
> - get(i, tfiles, ntfiles, crec, bufend,0);
> - else
> - fsort(i, depth+1, tfiles, ntfiles,
> - prevfp, ftbl);
> - }
> -
> - /* sort biggest (or last) bin at this level */
> - depth++;
> - panic++;
> - binno = maxb;
> - infiles.top = tfiles.top; /* getnext will free tfiles, */
> - nfiles = ntfiles; /* so overwrite them */
> - }
> - if (prevfp != outfp) {
> - concat(outfp, prevfp);
> - fclose(prevfp);
> }
> - for (i = panic; i >= 0; --i)
> - if (tailfp[i]) {
> - concat(outfp, tailfp[i]);
> - fclose(tailfp[i]);
> - }
> -}
> -
> -/*
> - * This is one pass of radix exchange, dumping the bins to disk.
> - */
> -#define swap(a, b, t) t = a, a = b, b = t
> -void
> -onepass(u_char **a, int depth, long n, long sizes[], u_char *tr, FILE *fp)
> -{
> - size_t tsizes[NBINS+1];
> - u_char **bin[257], **top[256], ***bp, ***bpmax, ***tp;
> - static int histo[256];
> - int *hp;
> - int c;
> - u_char **an, *t, **aj;
> - u_char **ak, *r;
> -
> - memset(tsizes, 0, sizeof(tsizes));
> - depth += sizeof(TRECHEADER);
> - an = &a[n];
> - for (ak = a; ak < an; ak++) {
> - histo[c = tr[**ak]]++;
> - tsizes[c] += ((RECHEADER *) (*ak -= depth))->length;
> + if (!ntfiles && !mfct) { /* everything in memory--pop */
> + if (nelem > 1 && radix_sort((const u_char **)keylist,
> + nelem, weights, REC_D))
> + err(2, NULL);
> + if (nelem > 0)
> + append(keylist, nelem, 0, outfp, putline, ftbl);
> }
> + if (!ntfiles)
> + fmerge(0, mstart, mfct, geteasy, outfp, putline, ftbl);
> + else
> + fmerge(0, tfiles, ntfiles, geteasy, outfp, putline,
> + ftbl);
>
> - bin[0] = a;
> - bpmax = bin + 256;
> - tp = top, hp = histo;
> - for (bp = bin; bp < bpmax; bp++) {
> - *tp++ = *(bp + 1) = *bp + (c = *hp);
> - *hp++ = 0;
> - if (c <= 1)
> - continue;
> - }
> - for (aj = a; aj < an; *aj = r, aj = bin[c + 1])
> - for (r = *aj; aj < (ak = --top[c = tr[r[depth]]]); )
> - swap(*ak, r, t);
> -
> - for (ak = a, c = 0; c < 256; c++) {
> - an = bin[c + 1];
> - n = an - ak;
> - tsizes[c] += n * sizeof(TRECHEADER);
> - /* tell getnext how many elements in this bin, this segment. */
> - EWRITE(&tsizes[c], sizeof(size_t), 1, fp);
> - sizes[c] += tsizes[c];
> - for (; ak < an; ++ak)
> - putrec((RECHEADER *) *ak, fp);
> - }
> + free(keylist);
> + keylist = NULL;
> + free(buffer);
> + buffer = NULL;
> }
> Index: fsort.h
> ===================================================================
> RCS file: /cvs/src/usr.bin/sort/fsort.h,v
> retrieving revision 1.10
> diff -u -p -r1.10 fsort.h
> --- fsort.h 13 Mar 2007 17:33:58 -0000 1.10
> +++ fsort.h 29 Jun 2014 22:17:16 -0000
> @@ -38,9 +38,17 @@
> #define BUFSIZE (1 << POW)
> #define MAXNUM (BUFSIZE/10) /* lowish guess at average record size */
> #define BUFFEND (EOF-2)
> -#define BUFFSMALL (EOF-3) /* buffer is too small to hold line */
> #define MAXFCT 1000
> #define MAXLLEN ((1 << min(POW-4, 16)) - 14)
> +#define MERGE_FNUM 16
> +
> +/*
> + * Default (initial) and maximum size of record buffer for fsort().
> + * Note that no more than MAXNUM records are stored in the buffer,
> + * even if the buffer is not full yet.
> + */
> +#define DEFBUFSIZE (1 << 20) /* 1MB */
> +#define MAXBUFSIZE (8 << 20) /* 10 MB */
>
> extern u_char *linebuf;
> extern size_t linebuf_size;
> Index: msort.c
> ===================================================================
> RCS file: /cvs/src/usr.bin/sort/msort.c,v
> retrieving revision 1.24
> diff -u -p -r1.24 msort.c
> --- msort.c 13 Nov 2013 15:07:27 -0000 1.24
> +++ msort.c 29 Jun 2014 22:17:16 -0000
> @@ -65,6 +65,7 @@ fmerge(int binno, union f_handle files,
> int i, j, last;
> void (*put)(RECHEADER *, FILE *);
> struct tempfile *l_fstack;
> + size_t tbufsiz;
>
> wts = ftbl->weights;
> if (!UNIQUE && SINGL_FLD && (ftbl->flags & F))
> @@ -72,12 +73,12 @@ fmerge(int binno, union f_handle files,
> if (cfilebuf == NULL) {
> cfilebuf = malloc(MAXLLEN + sizeof(MFILE));
> if (cfilebuf == NULL)
> - errx(2, "cannot allocate memory");
> + errx(2, NULL);
> }
>
> - i = min(16, nfiles) * LALIGN(MAXLLEN + sizeof(MFILE));
> - if (i > bufsize) {
> - bufsize = i;
> + tbufsiz = min(16, nfiles) * LALIGN(MAXLLEN + sizeof(MFILE));
> + if (tbufsiz > bufsize) {
> + bufsize = tbufsiz;
> if ((buffer = realloc(buffer, bufsize)) == NULL)
> err(2, NULL);
> }
> @@ -127,22 +128,54 @@ merge(int infl0, int nfiles,
> int (*get)(int, union f_handle, int, RECHEADER *, u_char *, struct field *),
> FILE *outfp, void (*put)(RECHEADER *, FILE *), struct field *ftbl)
> {
> - int c, i, j;
> + int c, i, j, nf = nfiles;
> union f_handle dummy = {0};
> - struct mfile *flist[16], *cfile;
> + struct mfile *flistb[MERGE_FNUM], **flist = flistb, *cfile;
> + size_t availsz = bufsize;
> + static void *bufs[MERGE_FNUM+1];
> + static size_t bufs_sz[MERGE_FNUM+1];
> +
> + /*
> + * We need nfiles + 1 buffers. One is 'buffer', the
> + * rest needs to be allocated.
> + */
> + bufs[0] = buffer;
> + bufs_sz[0] = bufsize;
> + for(i=1; i < nfiles+1; i++) {
> + if (bufs[i])
> + continue;
> +
> + bufs[i] = malloc(MAXLLEN);
> + if (!bufs[i])
> + err(2, NULL);
> + bufs_sz[i] = MAXLLEN;
> + }
>
> for (i = j = 0; i < nfiles; i++) {
> - cfile = (MFILE *) (buffer +
> - i * LALIGN(MAXLLEN + sizeof(MFILE)));
> - cfile->flno = j + infl0;
> - cfile->end = cfile->rec->data + MAXLLEN;
> + cfile = (struct mfile *) bufs[j];
> + cfile->flno = infl0 + j;
> + cfile->end = (u_char *) bufs[j] + bufs_sz[j];
> for (c = 1; c == 1;) {
> - if (EOF == (c = get(j+infl0, dummy, nfiles,
> + if (EOF == (c = get(cfile->flno, dummy, nfiles,
> cfile->rec, cfile->end, ftbl))) {
> i--;
> nfiles--;
> break;
> }
> +
> + if (c == BUFFEND) {
> + cfile = realloc(bufs[j], bufs_sz[j] *= 2);
> + bufs[j] = (void *)cfile;
> +
> + if (!cfile)
> + err(2, NULL);
> +
> + cfile->end = (u_char *)cfile + bufs_sz[j];
> +
> + c = 1;
> + continue;
> + }
> +
> if (i)
> c = insert(flist, &cfile, i, !DELETE);
> else
> @@ -151,24 +184,49 @@ merge(int infl0, int nfiles,
> j++;
> }
> if (nfiles > 0) {
> - cfile = cfilebuf;
> + cfile = (struct mfile *) bufs[nf];
> cfile->flno = flist[0]->flno;
> - cfile->end = cfile->rec->data + MAXLLEN;
> + cfile->end = (u_char *) cfile + bufs_sz[nf];
> while (nfiles) {
> for (c = 1; c == 1;) {
> if (EOF == (c = get(cfile->flno, dummy, nfiles,
> cfile->rec, cfile->end, ftbl))) {
> put(flist[0]->rec, outfp);
> - memmove(flist, flist + 1,
> - sizeof(MFILE *) * (--nfiles));
> - cfile->flno = flist[0]->flno;
> + if (--nfiles > 0) {
> + flist++;
> + cfile->flno = flist[0]->flno;
> + }
> break;
> }
> + if (c == BUFFEND) {
> + char *oldbuf = (char *)cfile;
> + availsz = (char *)cfile->end - oldbuf;
> + availsz *= 2;
> + cfile = realloc(oldbuf, availsz);
> + if (!cfile)
> + err(2, NULL);
> +
> + for (i = 0; i < nf + 1; i++) {
> + if (bufs[i] == oldbuf) {
> + bufs[i] = (char *)cfile;
> + bufs_sz[i] = availsz;
> + break;
> + }
> + }
> +
> + cfile->end = (u_char *)cfile + availsz;
> + c = 1;
> + continue;
> + }
> if (!(c = insert(flist, &cfile, nfiles, DELETE)))
> put(cfile->rec, outfp);
> }
> - }
> - }
> + }
> + if (bufs_sz[0] > bufsize) {
> + buffer = bufs[0];
> + bufsize = bufs_sz[0];
> + }
> + }
> }
>
> /*
> Index: sort.c
> ===================================================================
> RCS file: /cvs/src/usr.bin/sort/sort.c,v
> retrieving revision 1.41
> diff -u -p -r1.41 sort.c
> --- sort.c 13 Nov 2013 15:07:27 -0000 1.41
> +++ sort.c 29 Jun 2014 22:17:16 -0000
> @@ -44,13 +44,14 @@
>
> #include <sys/types.h>
> #include <sys/stat.h>
> +
> +#include <err.h>
> #include <locale.h>
> #include <paths.h>
> #include <signal.h>
> #include <stdlib.h>
> #include <string.h>
> #include <unistd.h>
> -#include <err.h>
>
> int REC_D = '\n';
> u_char d_mask[NBINS]; /* flags for rec_d, field_d, <blank> */
> @@ -71,6 +72,8 @@ struct coldesc *clist;
> int ncols = 0;
> int ND = 10; /* limit on number of -k options. */
>
> +int (*radix_sort)(const u_char **, int, const u_char *, u_int) = radixsort;
> +
> char *devstdin = _PATH_STDIN;
> char *tmpdir = _PATH_VARTMP;
> char toutpath[PATH_MAX];
> @@ -172,7 +175,6 @@ main(int argc, char *argv[])
> mflag = 1;
> break;
> case 'H':
> - PANIC = 0;
> break;
> case 'y':
> /* accept -y for backwards compat. */
> @@ -186,6 +188,7 @@ main(int argc, char *argv[])
> break;
> case 's':
> STABLE = 1;
> + radix_sort = sradixsort;
> break;
> case '?':
> default:
> @@ -292,8 +295,8 @@ main(int argc, char *argv[])
> act.sa_handler = onsig;
> for (i = 0; sigtable[i]; ++i) /* always unlink toutpath */
> if (sigaction(sigtable[i], NULL, &oact) < 0 ||
> - oact.sa_handler != SIG_IGN &&
> - sigaction(sigtable[i], &act, NULL) < 0)
> + (oact.sa_handler != SIG_IGN &&
> + sigaction(sigtable[i], &act, NULL) < 0))
> err(2, "sigaction");
> } else
> outfile = outpath;
> @@ -302,7 +305,7 @@ main(int argc, char *argv[])
> if (mflag)
> fmerge(-1, filelist, argc-optind, get, outfp, putline, fldtab);
> else
> - fsort(-1, 0, filelist, argc-optind, outfp, fldtab);
> + fsort(filelist, argc-optind, outfp, fldtab);
> if (outfile != outpath) {
> if (access(outfile, 0))
> err(2, "%s", outfile);
> Index: sort.h
> ===================================================================
> RCS file: /cvs/src/usr.bin/sort/sort.h,v
> retrieving revision 1.7
> diff -u -p -r1.7 sort.h
> --- sort.h 21 Aug 2007 20:29:25 -0000 1.7
> +++ sort.h 29 Jun 2014 22:17:16 -0000
> @@ -129,13 +129,14 @@ union f_handle {
> int top;
> char **names;
> };
> -extern int PANIC; /* maximum depth of fsort before fmerge is called */
> +
> extern u_char ascii[NBINS], Rascii[NBINS], Ftable[NBINS], RFtable[NBINS];
> extern u_char alltable[NBINS], dtable[NBINS], itable[NBINS];
> extern u_char d_mask[NBINS];
> extern int SINGL_FLD, SEP_FLAG, UNIQUE, STABLE;
> extern int REC_D;
> extern char *tmpdir;
> +extern int (*radix_sort)(const u_char **, int, const u_char *, u_int);
> extern int ND; /* limit on number of -k options. */
>
> #include "extern.h"
[prev in list] [next in list] [prev in thread] [next in thread]
Configure |
About |
News |
Add a list |
Sponsored by KoreLogic