[prev in list] [next in list] [prev in thread] [next in thread] 

List:       openbsd-tech
Subject:    Re: sort(1) updates
From:       Otto Moerbeek <otto () drijf ! net>
Date:       2014-06-30 9:53:04
Message-ID: 20140630095304.GA6395 () discoboy ! drijf ! net
[Download RAW message or body]

On Sun, Jun 29, 2014 at 06:48:32PM -0400, Jared Yanovich wrote:

> Hi,
> 
> sort(1) does some funky things and isn't hard to break:
> 
>   $ perl -e 'print "\n"x117000,"x\n"' | sort | sort -c
> 
> This patch contains a few changes from NetBSD to correct the behavior regarding
> ordering of appending bins to output in certain circumstances which helps pass
> more of our own regress tests and improves performance (e.g. regress suite
> runtime is <40% with new code compared to old/current code on my box).  The new
> code is also much easier to understand..
> 
> NetBSD logs:
> 
>   msort.c -r 1.9
>     merge(): use array of buffers instead of one big buffer for all records, and
>     	     enlarge them as necessary to read records from merged files; the buffers
> 	     are allocated once per program run, so there shouldn't be any
> 	     performance difference
>     This makes sort(1) pass also regression 40B and should make it
>     fully arbitrary long record capable.
>     XXX the buffer array could probably be freed on end of fmerge() to save memory
> 
>   fsort.c -r 1.37
>     The code that attempted to sort large files by sorting each chunk by the
>     first key byte and writing to a temp file, then sorting the records from
>     each temp file that had the same first key byte (and repeating for upto
>     4 key bytes) was a nice idea, but completely doomed to failure.
>     Eg PR/9308 where a 70MB file has all but one record the same and short keys.
>     Not only does the code not work, it is rather guaranteed to be slow.
>     Instead always use a merge sort for fully sorted chunk of records (each
>     temporary file contains one lot of sorted records).
>     The -H option already did this, so just rip out all the code and variables
>     that can't be used when -H was specified.

Hi Jared, good to see you active again ;-)

This indeed solves some problems, but I have a test file on which it cores.

The testfile is available at http://www.drijf.net/openbsd/test4.gz

	-Otto


> 
> Index: append.c
> ===================================================================
> RCS file: /cvs/src/usr.bin/sort/append.c,v
> retrieving revision 1.10
> diff -u -p -r1.10 append.c
> --- append.c	27 Oct 2009 23:59:43 -0000	1.10
> +++ append.c	29 Jun 2014 22:17:16 -0000
> @@ -148,37 +148,3 @@ append(u_char **keylist, int nelem, int 
>  		put(crec, fp);
>  	}
>  }
> -
> -/*
> - * output the already sorted eol bin.
> - */
> -void
> -rd_append(int binno, union f_handle infl0, int nfiles, FILE *outfp,
> -    u_char *buffer, u_char *bufend)
> -{
> -	RECHEADER *rec;
> -
> -	rec = (RECHEADER *) buffer;
> -	if (!getnext(binno, infl0, nfiles, (RECHEADER *) buffer, bufend, 0)) {
> -		putline(rec, outfp);
> -		while (getnext(binno, infl0, nfiles, (RECHEADER *) buffer,
> -			bufend, 0) == 0) {
> -			if (!UNIQUE)
> -				putline(rec, outfp);
> -		}
> -	}
> -}
> -
> -/*
> - * append plain text--used after sorting the biggest bin.
> - */
> -void
> -concat(FILE *a, FILE *b)
> -{
> -        int nread;
> -        char buffer[4096];
> -
> -	rewind(b);
> -        while ((nread = fread(buffer, 1, 4096, b)) > 0)
> -                EWRITE(buffer, 1, nread, a);
> -}
> Index: extern.h
> ===================================================================
> RCS file: /cvs/src/usr.bin/sort/extern.h,v
> retrieving revision 1.8
> diff -u -p -r1.8 extern.h
> --- extern.h	22 Dec 2009 19:47:02 -0000	1.8
> +++ extern.h	29 Jun 2014 22:17:16 -0000
> @@ -36,7 +36,6 @@
>  
>  void	 append(u_char **, int, int, FILE *, void (*)(RECHEADER *, FILE *),
>  	    struct field *);
> -void	 concat(FILE *, FILE *);
>  length_t enterkey(RECHEADER *, DBT *, int, struct field *);
>  void	 fixit(int *, char **);
>  void	 fldreset(struct field *);
> @@ -44,11 +43,9 @@ FILE	*ftmp(void);
>  void	 fmerge(int, union f_handle, int,
>  	    int (*)(int, union f_handle, int, RECHEADER *, u_char *, struct field *),
>  	    FILE *, void (*)(RECHEADER *, FILE *), struct field *);
> -void	 fsort(int, int, union f_handle, int, FILE *, struct field *);
> +void	 fsort(union f_handle, int, FILE *, struct field *);
>  int	 geteasy(int, union f_handle,
>  	    int, RECHEADER *, u_char *, struct field *);
> -int	 getnext(int, union f_handle,
> -	    int, RECHEADER *, u_char *, struct field *);
>  int	 makekey(int, union f_handle,
>  	    int, RECHEADER *, u_char *, struct field *);
>  int	 makeline(int, union f_handle,
> @@ -57,7 +54,6 @@ void	 merge(int, int,
>  	    int (*)(int, union f_handle, int, RECHEADER *, u_char *, struct field *),
>  	    FILE *, void (*)(RECHEADER *, FILE *), struct field *);
>  void	 num_init(void);
> -void	 onepass(u_char **, int, long, long *, u_char *, FILE *);
>  int	 optval(int, int);
>  void	 order(union f_handle,
>  	    int (*)(int, union f_handle, int, RECHEADER *, u_char *, struct field *),
> @@ -65,6 +61,5 @@ void	 order(union f_handle,
>  	    int c_warn);
>  void	 putline(RECHEADER *, FILE *);
>  void	 putrec(RECHEADER *, FILE *);
> -void	 rd_append(int, union f_handle, int, FILE *, u_char *, u_char *);
>  int	 setfield(char *, struct field *, int);
>  void	 settables(int);
> Index: files.c
> ===================================================================
> RCS file: /cvs/src/usr.bin/sort/files.c,v
> retrieving revision 1.13
> diff -u -p -r1.13 files.c
> --- files.c	27 Oct 2009 23:59:43 -0000	1.13
> +++ files.c	29 Jun 2014 22:17:16 -0000
> @@ -40,73 +40,6 @@
>  static int	seq(FILE *, DBT *, DBT *);
>  
>  /*
> - * this is the subroutine for file management for fsort().
> - * It keeps the buffers for all temporary files.
> - */
> -int
> -getnext(int binno, union f_handle infl0, int nfiles, RECHEADER *pos, u_char *end,
> -    struct field *dummy)
> -{
> -	int i;
> -	u_char *hp;
> -	static size_t nleft = 0;
> -	static int cnt = 0, flag = -1;
> -	static u_char maxb = 0;
> -	static FILE *fp;
> -
> -	if (nleft == 0) {
> -		if (binno < 0)	/* reset files. */ {
> -			for (i = 0; i < nfiles; i++) {
> -				rewind(fstack[infl0.top + i].fp);
> -				fstack[infl0.top + i].max_o = 0;
> -			}
> -			flag = -1;
> -			nleft = cnt = 0;
> -			return (-1);
> -		}
> -		maxb = fstack[infl0.top].maxb;
> -		for (; nleft == 0; cnt++) {
> -			if (cnt >= nfiles) {
> -				cnt = 0;
> -				return (EOF);
> -			}
> -			fp = fstack[infl0.top + cnt].fp;
> -			fread(&nleft, sizeof(nleft), 1, fp);
> -			if (binno < maxb)
> -				fstack[infl0.top+cnt].max_o
> -					+= sizeof(nleft) + nleft;
> -			else if (binno == maxb) {
> -				if (binno != fstack[infl0.top].lastb) {
> -					fseek(fp, fstack[infl0.top+
> -						cnt].max_o, SEEK_SET);
> -					fread(&nleft, sizeof(nleft), 1, fp);
> -				}
> -				if (nleft == 0)
> -					fclose(fp);
> -			} else if (binno == maxb + 1) {		/* skip a bin */
> -				fseek(fp, nleft, SEEK_CUR);
> -				fread(&nleft, sizeof(nleft), 1, fp);
> -				flag = cnt;
> -			}
> -		}
> -	}
> -	if ((u_char *) pos > end - sizeof(TRECHEADER))
> -		return (BUFFEND);
> -	fread(pos, sizeof(TRECHEADER), 1, fp);
> -	if (end - pos->data < pos->length) {
> -		hp = ((u_char *)pos) + sizeof(TRECHEADER);
> -		for (i = sizeof(TRECHEADER); i ;  i--)
> -			ungetc(*--hp, fp);
> -		return (BUFFEND);
> -	}
> -	fread(pos->data, pos->length, 1, fp);
> -	nleft -= pos->length + sizeof(TRECHEADER);
> -	if (nleft == 0 && binno == fstack[infl0.top].maxb)
> -		fclose(fp);
> -	return (0);
> -}
> -
> -/*
>   * this is called when there is no special key. It's only called
>   * in the first fsort pass.
>   */
> Index: fsort.c
> ===================================================================
> RCS file: /cvs/src/usr.bin/sort/fsort.c,v
> retrieving revision 1.21
> diff -u -p -r1.21 fsort.c
> --- fsort.c	27 Oct 2009 23:59:43 -0000	1.21
> +++ fsort.c	29 Jun 2014 22:17:16 -0000
> @@ -33,11 +33,11 @@
>   */
>  
>  /*
> - * Read in the next bin.  If it fits in one segment sort it;
> - * otherwise refine it by segment deeper by one character,
> - * and try again on smaller bins.  Sort the final bin at this level
> - * of recursion to keep the head of fstack at 0.
> - * After PANIC passes, abort to merge sort.
> + * Read in a block of records (until 'enough').
> + * sort, write to temp file.
> + * Merge sort temp files into output file
> + * Small files miss out the temp file stage.
> + * Large files might get multiple merges.
>   */
>  #include "sort.h"
>  #include "fsort.h"
> @@ -48,28 +48,26 @@
>  u_char *linebuf;
>  size_t linebuf_size = MAXLLEN;
>  struct tempfile fstack[MAXFCT];
> -#define FSORTMAX 4
> -int PANIC = FSORTMAX;
> +
> +#define CHECKFSTACK(n)					\
> +	if (n >= MAXFCT)				\
> +		errx(2, "fstack: too many temporary files; sort in pieces")
>  
>  void
> -fsort(int binno, int depth, union f_handle infiles, int nfiles, FILE *outfp,
> +fsort(union f_handle infiles, int nfiles, FILE *outfp,
>      struct field *ftbl)
>  {
> -	u_char *weights, **keypos, *bufend, *tmpbuf;
> +	u_char *weights, **keypos, *bufend;
>  	static u_char *buffer, **keylist;
>  	static size_t bufsize;
> -	int ntfiles, mfct = 0, total, i, maxb, lastb, panic = 0;
> +	int ntfiles, mfct = 0;
>  	int c, nelem;
> -	long sizes[NBINS+1];
> -	union f_handle tfiles, mstart = {MAXFCT-16};
> +	union f_handle tfiles, mstart = { MAXFCT - MERGE_FNUM };
>  	int (*get)(int, union f_handle, int, RECHEADER *,
>  		u_char *, struct field *);
>  	RECHEADER *crec;
>  	struct field tfield[2];
> -	FILE *prevfp, *tailfp[FSORTMAX+1];
>  
> -	memset(tailfp, 0, sizeof(tailfp));
> -	prevfp = outfp;
>  	memset(tfield, 0, sizeof(tfield));
>  	if (ftbl[0].flags & R)
>  		tfield[0].weights = Rascii;
> @@ -84,205 +82,114 @@ fsort(int binno, int depth, union f_hand
>  			err(2, NULL);
>  	}
>  	bufend = buffer + bufsize - 1;
> -	if (binno >= 0) {
> -		tfiles.top = infiles.top + nfiles;
> -		get = getnext;
> -	} else {
> -		tfiles.top = 0;
> -		if (SINGL_FLD)
> -			get = makeline;
> -		else
> -			get = makekey;
> -	}				
> -	for (;;) {
> -		memset(sizes, 0, sizeof(sizes));
> -		c = ntfiles = 0;
> -		if (binno == weights[REC_D] &&
> -		    !(SINGL_FLD && ftbl[0].flags & F)) {	/* pop */
> -			rd_append(weights[REC_D],
> -			    infiles, nfiles, prevfp, buffer, bufend);
> -			break;
> -		} else if (binno == weights[REC_D]) {
> -			depth = 0;		/* start over on flat weights */
> -			ftbl = tfield;
> -			weights = ftbl[0].weights;
> +
> +	if (SINGL_FLD)
> +		get = makeline;
> +	else
> +		get = makekey;
> +
> +	c = nelem = ntfiles = 0;
> +	keypos = keylist;
> +	crec = (RECHEADER *) buffer;
> +	while (c != EOF) {
> +		keypos = keylist;
> +		nelem = 0;
> +		crec = (RECHEADER *) buffer;
> +
> + do_read:
> +		while ((c = get(-1, infiles, nfiles, crec, bufend,
> +		    ftbl)) == 0) {
> +			*keypos++ = crec->data;
> +			if (++nelem == MAXNUM) {
> +				c = BUFFEND;
> +				break;
> +			}
> +			crec = (RECHEADER *)((char *) crec +
> +			    SALIGN(crec->length) + sizeof(TRECHEADER));
>  		}
> -		while (c != EOF) {
> -			keypos = keylist;
> -			nelem = 0;
> -			crec = (RECHEADER *) buffer;
> -			while ((c = get(binno, infiles, nfiles, crec, bufend,
> -			    ftbl)) == 0) {
> -				*keypos++ = crec->data + depth;
> -				if (++nelem == MAXNUM) {
> -					c = BUFFEND;
> -					break;
> -				}
> -				crec = (RECHEADER *)((char *)crec +
> -				    SALIGN(crec->length) + sizeof(TRECHEADER));
> +		if (c == BUFFEND && nelem < MAXNUM &&
> +		    bufsize < MAXBUFSIZE) {
> +			u_char **keyp;
> +			u_char *oldb = buffer;
> +			u_char *nbuffer;
> +
> +			/* buffer was too small for data, allocate
> +			 * bigger buffer */
> +			nbuffer = realloc(buffer, bufsize * 2);
> +			if (!nbuffer) {
> +				err(2, "failed to realloc buffer to %lu bytes",
> +				    (unsigned long) bufsize * 2);
>  			}
> +			buffer = nbuffer;
> +			bufsize *= 2;
> +			bufend = buffer + bufsize;
> +
> +			/* patch up keylist[] */
> +			for (keyp = &keypos[-1]; keyp >= keylist; keyp--)
> +				*keyp = buffer + (*keyp - oldb);
> +
> +			crec = (RECHEADER *) (buffer +
> +			    ((u_char *)crec - oldb));
> +			goto do_read;
> +		}
> +		if (c != BUFFEND && !ntfiles && !mfct) {
> +			/* do not push */
> +			continue;
> +		}
> +
> +		/* push */
> +		fstack[mstart.top + mfct].fp = ftmp();
> +		if (radix_sort((const u_char **)keylist, nelem, weights,
> +		    REC_D))
> +			err(2, NULL);
> +		append(keylist, nelem, 0, fstack[mstart.top + mfct].fp,
> +		    putrec, ftbl);
> +		mfct++;
> +
> +		/* reduce number of open files */
> +		if (mfct == MERGE_FNUM || (c == EOF && ntfiles)) {
>  			/*
> -			 * buffer was too small for data, allocate
> -			 * a bigger buffer.
> +			 * Only copy extra incomplete crec
> +			 * data if there are any.
>  			 */
> -			if (c == BUFFEND && nelem == 0) {
> -				bufsize *= 2;
> -				tmpbuf = realloc(buffer, bufsize);
> -				if (!tmpbuf)
> -					err(2, "failed to realloc buffer");
> -				crec = (RECHEADER *)
> -				    (tmpbuf + ((u_char *)crec - buffer));
> -				buffer = tmpbuf;
> -				bufend = buffer + bufsize - 1;
> -				continue;
> +			int nodata = (bufend >= (u_char *)crec &&
> +			    bufend <= crec->data);
> +			size_t sz = 0;
> +			u_char *tmpbuf = NULL;
> +
> +			if (!nodata) {
> +				sz = bufend - crec->data;
> +				tmpbuf = malloc(sz);
> +				memmove(tmpbuf, crec->data, sz);
>  			}
> -			if (c == BUFFEND || ntfiles || mfct) {	/* push */
> -				if (panic >= PANIC) {
> -					fstack[MAXFCT-16+mfct].fp = ftmp();
> -					if (radixsort((const u_char **)keylist,
> -					    nelem, weights, REC_D))
> -						err(2, NULL);
> -					append(keylist, nelem, depth, fstack[
> -					 MAXFCT-16+mfct].fp, putrec, ftbl);
> -					mfct++;
> -					/* reduce number of open files */
> -					if (mfct == 16 ||(c == EOF && ntfiles)) {
> -						fstack[tfiles.top + ntfiles].fp
> -						    = ftmp();
> -						fmerge(0, mstart, mfct, geteasy,
> -						  fstack[tfiles.top+ntfiles].fp,
> -						  putrec, ftbl);
> -						ntfiles++;
> -						mfct = 0;
> -					}
> -				} else {
> -					fstack[tfiles.top + ntfiles].fp= ftmp();
> -					onepass(keylist, depth, nelem, sizes,
> -					weights, fstack[tfiles.top+ntfiles].fp);
> -					ntfiles++;
> -				}
> +			CHECKFSTACK(ntfiles);
> +			fstack[ntfiles].fp = ftmp();
> +			fmerge(0, mstart, mfct, geteasy,
> +			    fstack[ntfiles].fp, putrec, ftbl);
> +			ntfiles++;
> +			mfct = 0;
> +
> +			if (!nodata) {
> +				memmove(crec->data, tmpbuf, sz);
> +				free(tmpbuf);
>  			}
>  		}
> -		get = getnext;
> -		if (!ntfiles && !mfct) {	/* everything in memory--pop */
> -			if (nelem > 1) {
> -				if (STABLE) {
> -					i = sradixsort((const u_char **)keylist,
> -					    nelem, weights, REC_D);
> -				} else {
> -					i = radixsort((const u_char **)keylist,
> -					    nelem, weights, REC_D);
> -				}
> -				if (i)
> -					err(2, NULL);
> -			}
> -			append(keylist, nelem, depth, outfp, putline, ftbl);
> -			break;					/* pop */
> -		}
> -		if (panic >= PANIC) {
> -			if (!ntfiles)
> -				fmerge(0, mstart, mfct, geteasy,
> -				    outfp, putline, ftbl);
> -			else
> -				fmerge(0, tfiles, ntfiles, geteasy,
> -				    outfp, putline, ftbl);
> -			break;
> -		}
> -		total = maxb = lastb = 0;	/* find if one bin dominates */
> -		for (i = 0; i < NBINS; i++)
> -		  if (sizes[i]) {
> -			if (sizes[i] > sizes[maxb])
> -				maxb = i;
> -			lastb = i;
> -			total += sizes[i];
> -		}
> -		if (sizes[maxb] < max((total / 2) , BUFSIZE))
> -			maxb = lastb;	/* otherwise pop after last bin */
> -		fstack[tfiles.top].lastb = lastb;
> -		fstack[tfiles.top].maxb = maxb;
> -
> -			/* start refining next level. */
> -		get(-1, tfiles, ntfiles, crec, bufend, 0);	/* rewind */
> -		for (i = 0; i < maxb; i++) {
> -			if (!sizes[i])	/* bin empty; step ahead file offset */
> -				get(i, tfiles, ntfiles, crec, bufend, 0);
> -			else
> -				fsort(i, depth+1, tfiles, ntfiles, outfp, ftbl);
> -		}
> -		if (lastb != maxb) {
> -			if (prevfp != outfp)
> -				tailfp[panic] = prevfp;
> -			prevfp = ftmp();
> -			for (i = maxb+1; i <= lastb; i++)
> -				if (!sizes[i])
> -					get(i, tfiles, ntfiles, crec, bufend,0);
> -				else
> -					fsort(i, depth+1, tfiles, ntfiles,
> -					    prevfp, ftbl);
> -		}
> -
> -		/* sort biggest (or last) bin at this level */
> -		depth++;
> -		panic++;
> -		binno = maxb;
> -		infiles.top = tfiles.top;	/* getnext will free tfiles, */
> -		nfiles = ntfiles;		/* so overwrite them */
> -	}
> -	if (prevfp != outfp) {
> -		concat(outfp, prevfp);
> -		fclose(prevfp);
>  	}
> -	for (i = panic; i >= 0; --i)
> -		if (tailfp[i]) {
> -			concat(outfp, tailfp[i]);
> -			fclose(tailfp[i]);
> -		}
> -}
> -
> -/*
> - * This is one pass of radix exchange, dumping the bins to disk.
> - */
> -#define swap(a, b, t) t = a, a = b, b = t
> -void
> -onepass(u_char **a, int depth, long n, long sizes[], u_char *tr, FILE *fp)
> -{
> -	size_t tsizes[NBINS+1];
> -	u_char **bin[257], **top[256], ***bp, ***bpmax, ***tp;
> -	static int histo[256];
> -	int *hp;
> -	int c;
> -	u_char **an, *t, **aj;
> -	u_char **ak, *r;
> -
> -	memset(tsizes, 0, sizeof(tsizes));
> -	depth += sizeof(TRECHEADER);
> -	an = &a[n];
> -	for (ak = a; ak < an; ak++) {
> -		histo[c = tr[**ak]]++;
> -		tsizes[c] += ((RECHEADER *) (*ak -= depth))->length;
> +	if (!ntfiles && !mfct) {	/* everything in memory--pop */
> +		if (nelem > 1 && radix_sort((const u_char **)keylist,
> +		    nelem, weights, REC_D))
> +			err(2, NULL);
> +		if (nelem > 0)
> +			append(keylist, nelem, 0, outfp, putline, ftbl);
>  	}
> +	if (!ntfiles)
> +		fmerge(0, mstart, mfct, geteasy, outfp, putline, ftbl);
> +	else
> +		fmerge(0, tfiles, ntfiles, geteasy, outfp, putline,
> +		    ftbl);
>  
> -	bin[0] = a;
> -	bpmax = bin + 256;
> -	tp = top, hp = histo;
> -	for (bp = bin; bp < bpmax; bp++) {
> -		*tp++ = *(bp + 1) = *bp + (c = *hp);
> -		*hp++ = 0;
> -		if (c <= 1)
> -			continue;
> -	}
> -	for (aj = a; aj < an; *aj = r, aj = bin[c + 1]) 
> -		for (r = *aj; aj < (ak = --top[c = tr[r[depth]]]); )
> -			swap(*ak, r, t);
> -
> -	for (ak = a, c = 0; c < 256; c++) {
> -		an = bin[c + 1];
> -		n = an - ak;
> -		tsizes[c] += n * sizeof(TRECHEADER);
> -		/* tell getnext how many elements in this bin, this segment. */
> -		EWRITE(&tsizes[c], sizeof(size_t), 1, fp);
> -		sizes[c] += tsizes[c];
> -		for (; ak < an; ++ak)
> -			putrec((RECHEADER *) *ak, fp);
> -	}
> +	free(keylist);
> +	keylist = NULL;
> +	free(buffer);
> +	buffer = NULL;
>  }
> Index: fsort.h
> ===================================================================
> RCS file: /cvs/src/usr.bin/sort/fsort.h,v
> retrieving revision 1.10
> diff -u -p -r1.10 fsort.h
> --- fsort.h	13 Mar 2007 17:33:58 -0000	1.10
> +++ fsort.h	29 Jun 2014 22:17:16 -0000
> @@ -38,9 +38,17 @@
>  #define BUFSIZE (1 << POW)
>  #define MAXNUM (BUFSIZE/10)	/* lowish guess at average record size */
>  #define BUFFEND (EOF-2)
> -#define BUFFSMALL (EOF-3)	/* buffer is too small to hold line */
>  #define MAXFCT 1000
>  #define MAXLLEN ((1 << min(POW-4, 16)) - 14)
> +#define MERGE_FNUM	16 
> +
> +/*
> + * Default (initial) and maximum size of record buffer for fsort().
> + * Note that no more than MAXNUM records are stored in the buffer,
> + * even if the buffer is not full yet.
> + */
> +#define DEFBUFSIZE      (1 << 20)       /* 1MB */
> +#define MAXBUFSIZE      (8 << 20)       /* 10 MB */
>  
>  extern u_char *linebuf;
>  extern size_t linebuf_size;
> Index: msort.c
> ===================================================================
> RCS file: /cvs/src/usr.bin/sort/msort.c,v
> retrieving revision 1.24
> diff -u -p -r1.24 msort.c
> --- msort.c	13 Nov 2013 15:07:27 -0000	1.24
> +++ msort.c	29 Jun 2014 22:17:16 -0000
> @@ -65,6 +65,7 @@ fmerge(int binno, union f_handle files, 
>  	int i, j, last;
>  	void (*put)(RECHEADER *, FILE *);
>  	struct tempfile *l_fstack;
> +	size_t tbufsiz;
>  
>  	wts = ftbl->weights;
>  	if (!UNIQUE && SINGL_FLD && (ftbl->flags & F))
> @@ -72,12 +73,12 @@ fmerge(int binno, union f_handle files, 
>  	if (cfilebuf == NULL) {
>  		cfilebuf = malloc(MAXLLEN + sizeof(MFILE));
>  		if (cfilebuf == NULL)
> -			errx(2, "cannot allocate memory");
> +			errx(2, NULL);
>  	}
>  
> -	i = min(16, nfiles) * LALIGN(MAXLLEN + sizeof(MFILE));
> -	if (i > bufsize) {
> -		bufsize = i;
> +	tbufsiz = min(16, nfiles) * LALIGN(MAXLLEN + sizeof(MFILE));
> +	if (tbufsiz > bufsize) {
> +		bufsize = tbufsiz;
>  		if ((buffer = realloc(buffer, bufsize)) == NULL)
>  			err(2, NULL);
>  	}
> @@ -127,22 +128,54 @@ merge(int infl0, int nfiles,
>      int (*get)(int, union f_handle, int, RECHEADER *, u_char *, struct field *),
>      FILE *outfp, void (*put)(RECHEADER *, FILE *), struct field *ftbl)
>  {
> -	int c, i, j;
> +	int c, i, j, nf = nfiles;
>  	union f_handle dummy = {0};
> -	struct mfile *flist[16], *cfile;
> +	struct mfile *flistb[MERGE_FNUM], **flist = flistb, *cfile;
> +	size_t availsz = bufsize;
> +	static void *bufs[MERGE_FNUM+1];
> +	static size_t bufs_sz[MERGE_FNUM+1];
> +
> +	/*
> +	 * We need nfiles + 1 buffers. One is 'buffer', the
> +	 * rest needs to be allocated.
> +	 */
> +	bufs[0] = buffer;
> +	bufs_sz[0] = bufsize;
> +	for(i=1; i < nfiles+1; i++) {
> +		if (bufs[i])
> +			continue;
> +
> +		bufs[i] = malloc(MAXLLEN);
> +		if (!bufs[i])
> +			err(2, NULL);
> +		bufs_sz[i] = MAXLLEN;
> +	}
>  
>  	for (i = j = 0; i < nfiles; i++) {
> -		cfile = (MFILE *) (buffer +
> -		    i * LALIGN(MAXLLEN + sizeof(MFILE)));
> -		cfile->flno = j + infl0;
> -		cfile->end = cfile->rec->data + MAXLLEN;
> +		cfile = (struct mfile *) bufs[j];
> +		cfile->flno = infl0 + j;
> +		cfile->end = (u_char *) bufs[j] + bufs_sz[j];
>  		for (c = 1; c == 1;) {
> -			if (EOF == (c = get(j+infl0, dummy, nfiles,
> +			if (EOF == (c = get(cfile->flno, dummy, nfiles,
>  			   cfile->rec, cfile->end, ftbl))) {
>  				i--;
>  				nfiles--;
>  				break;
>  			}
> +
> +			if (c == BUFFEND) {
> +				cfile = realloc(bufs[j], bufs_sz[j] *= 2);
> +				bufs[j] = (void *)cfile;
> +
> +				if (!cfile)
> +					err(2, NULL);
> +
> +				cfile->end = (u_char *)cfile + bufs_sz[j];
> +
> +				c = 1;
> +				continue;
> +			}
> +
>  			if (i)
>  				c = insert(flist, &cfile, i, !DELETE);
>  			else
> @@ -151,24 +184,49 @@ merge(int infl0, int nfiles,
>  		j++;
>  	}
>  	if (nfiles > 0) {
> -		cfile = cfilebuf;
> +		cfile = (struct mfile *) bufs[nf];
>  		cfile->flno = flist[0]->flno;
> -		cfile->end = cfile->rec->data + MAXLLEN;
> +		cfile->end = (u_char *) cfile + bufs_sz[nf];
>  		while (nfiles) {
>  			for (c = 1; c == 1;) {
>  				if (EOF == (c = get(cfile->flno, dummy, nfiles,
>  				   cfile->rec, cfile->end, ftbl))) {
>  					put(flist[0]->rec, outfp);
> -					memmove(flist, flist + 1,
> -					    sizeof(MFILE *) * (--nfiles));
> -					cfile->flno = flist[0]->flno;
> +					if (--nfiles > 0) {
> +						flist++;
> +						cfile->flno = flist[0]->flno;
> +					}
>  					break;
>  				}
> +				if (c == BUFFEND) {
> +					char *oldbuf = (char *)cfile;
> +					availsz = (char *)cfile->end - oldbuf;
> +					availsz *= 2;
> +					cfile = realloc(oldbuf, availsz);
> +					if (!cfile)
> +						err(2, NULL);
> +
> +					for (i = 0; i < nf + 1; i++) {
> +						if (bufs[i] == oldbuf) {
> +							bufs[i] = (char *)cfile;
> +							bufs_sz[i] = availsz;
> +							break;
> +						}
> +					}
> +
> +					cfile->end = (u_char *)cfile + availsz;
> +					c = 1;
> +					continue;
> +				}
>  				if (!(c = insert(flist, &cfile, nfiles, DELETE)))
>  					put(cfile->rec, outfp);
>  			}
> -		}	
> -	}	
> +		}
> +		if (bufs_sz[0] > bufsize) {
> +			buffer = bufs[0];
> +			bufsize = bufs_sz[0];
> +		}
> +	}
>  }
>  
>  /*
> Index: sort.c
> ===================================================================
> RCS file: /cvs/src/usr.bin/sort/sort.c,v
> retrieving revision 1.41
> diff -u -p -r1.41 sort.c
> --- sort.c	13 Nov 2013 15:07:27 -0000	1.41
> +++ sort.c	29 Jun 2014 22:17:16 -0000
> @@ -44,13 +44,14 @@
>  
>  #include <sys/types.h>
>  #include <sys/stat.h>
> +
> +#include <err.h>
>  #include <locale.h>
>  #include <paths.h>
>  #include <signal.h>
>  #include <stdlib.h>
>  #include <string.h>
>  #include <unistd.h>
> -#include <err.h>
>  
>  int REC_D = '\n';
>  u_char d_mask[NBINS];		/* flags for rec_d, field_d, <blank> */
> @@ -71,6 +72,8 @@ struct coldesc *clist;
>  int ncols = 0;
>  int ND = 10;			/* limit on number of -k options. */
>  
> +int (*radix_sort)(const u_char **, int, const u_char *, u_int) = radixsort;
> +
>  char *devstdin = _PATH_STDIN;
>  char *tmpdir = _PATH_VARTMP;
>  char toutpath[PATH_MAX];
> @@ -172,7 +175,6 @@ main(int argc, char *argv[])
>  			mflag = 1;
>  			break;
>  		case 'H':
> -			PANIC = 0;
>  			break;
>  		case 'y':
>  			/* accept -y for backwards compat. */
> @@ -186,6 +188,7 @@ main(int argc, char *argv[])
>  			break;
>  		case 's':
>  			STABLE = 1;
> +			radix_sort = sradixsort;
>  			break;
>  		case '?':
>  		default:
> @@ -292,8 +295,8 @@ main(int argc, char *argv[])
>  		act.sa_handler = onsig;
>  		for (i = 0; sigtable[i]; ++i)	/* always unlink toutpath */
>  			if (sigaction(sigtable[i], NULL, &oact) < 0 ||
> -			    oact.sa_handler != SIG_IGN &&
> -			    sigaction(sigtable[i], &act, NULL) < 0)
> +			    (oact.sa_handler != SIG_IGN &&
> +			    sigaction(sigtable[i], &act, NULL) < 0))
>  				err(2, "sigaction");
>  	} else
>  		outfile = outpath;
> @@ -302,7 +305,7 @@ main(int argc, char *argv[])
>  	if (mflag)
>  		fmerge(-1, filelist, argc-optind, get, outfp, putline, fldtab);
>  	else
> -		fsort(-1, 0, filelist, argc-optind, outfp, fldtab);
> +		fsort(filelist, argc-optind, outfp, fldtab);
>  	if (outfile != outpath) {
>  		if (access(outfile, 0))
>  			err(2, "%s", outfile);
> Index: sort.h
> ===================================================================
> RCS file: /cvs/src/usr.bin/sort/sort.h,v
> retrieving revision 1.7
> diff -u -p -r1.7 sort.h
> --- sort.h	21 Aug 2007 20:29:25 -0000	1.7
> +++ sort.h	29 Jun 2014 22:17:16 -0000
> @@ -129,13 +129,14 @@ union f_handle {
>  	int top;
>  	char **names;
>  };
> -extern int PANIC;	/* maximum depth of fsort before fmerge is called */
> +
>  extern u_char ascii[NBINS], Rascii[NBINS], Ftable[NBINS], RFtable[NBINS];
>  extern u_char alltable[NBINS], dtable[NBINS], itable[NBINS];
>  extern u_char d_mask[NBINS];
>  extern int SINGL_FLD, SEP_FLAG, UNIQUE, STABLE;
>  extern int REC_D;
>  extern char *tmpdir;
> +extern int (*radix_sort)(const u_char **, int, const u_char *, u_int);
>  extern int ND;		/* limit on number of -k options. */
>  
>  #include "extern.h"


[prev in list] [next in list] [prev in thread] [next in thread] 

Configure | About | News | Add a list | Sponsored by KoreLogic