[prev in list] [next in list] [prev in thread] [next in thread] 

List:       busybox-cvs
Subject:    [git commit] cut: add toybox-compatible options -O OUTSEP, -D, -F LIST
From:       Denys Vlasenko <vda.linux () googlemail ! com>
Date:       2021-07-20 14:29:22
Message-ID: 20210720135548.DEECC884A6 () busybox ! osuosl ! org
[Download RAW message or body]

commit: https://git.busybox.net/busybox/commit/?id=0068ce2fa0e3dbc618b8ca87ab56e144731da784
                
branch: https://git.busybox.net/busybox/commit/?id=refs/heads/master

function                                             old     new   delta
cut_main                                             884    1201    +317
packed_usage                                       33823   33885     +62
.rodata                                           104186  104179      -7
------------------------------------------------------------------------------
(add/remove: 0/0 grow/shrink: 2/1 up/down: 379/-7)            Total: 372 bytes

Signed-off-by: Rob Landley <rob@landley.net>
Signed-off-by: Denys Vlasenko <vda.linux@googlemail.com>
---
 coreutils/cut.c     | 207 ++++++++++++++++++++++++++++------------------------
 testsuite/cut.tests |  64 ++++++++++++++++
 2 files changed, 174 insertions(+), 97 deletions(-)

diff --git a/coreutils/cut.c b/coreutils/cut.c
index cc3c32576..7009e74cf 100644
--- a/coreutils/cut.c
+++ b/coreutils/cut.c
@@ -14,6 +14,13 @@
 //config:	help
 //config:	cut is used to print selected parts of lines from
 //config:	each file to stdout.
+//config:
+//config:config FEATURE_CUT_REGEX
+//config:	bool "cut -F"
+//config:	default y
+//config:	depends on CUT
+//config:	help
+//config:	Allow regex based delimiters.
 
 //applet:IF_CUT(APPLET_NOEXEC(cut, cut, BB_DIR_USR_BIN, BB_SUID_DROP, cut))
 
@@ -25,9 +32,14 @@
 //usage:       "Print selected fields from FILEs to stdout\n"
 //usage:     "\n	-b LIST	Output only bytes from LIST"
 //usage:     "\n	-c LIST	Output only characters from LIST"
-//usage:     "\n	-d CHAR	Use CHAR instead of tab as field delimiter"
+//usage:     "\n	-d SEP	Field delimiter for input (default -f TAB, -F run of \
whitespace)" +//usage:     "\n	-O SEP	Field delimeter for output (default = -d for \
-f, one space for -F)" +//usage:     "\n	-D	Don't sort/collate sections or match -fF \
lines without delimeter" +//usage:     "\n	-f LIST	Print only these fields (-d is \
single char)" +//usage:     IF_FEATURE_CUT_REGEX(
+//usage:     "\n	-F LIST	Print only these fields (-d is regex)"
+//usage:     )
 //usage:     "\n	-s	Output only lines containing delimiter"
-//usage:     "\n	-f LIST	Print only these fields"
 //usage:     "\n	-n	Ignored"
 //(manpage:-n	with -b: don't split multibyte characters)
 //usage:
@@ -39,38 +51,49 @@
 
 #include "libbb.h"
 
+#if ENABLE_FEATURE_CUT_REGEX
+#include "xregex.h"
+#else
+#define regex_t int
+typedef struct { int rm_eo, rm_so; } regmatch_t;
+#define xregcomp(x, ...) *(x) = 0
+#define regexec(...)     0
+#endif
+
 /* This is a NOEXEC applet. Be very careful! */
 
 
 /* option vars */
-#define OPT_STR "b:c:f:d:sn"
+#define OPT_STR "b:c:f:d:O:sD"IF_FEATURE_CUT_REGEX("F:")"n"
 #define CUT_OPT_BYTE_FLGS     (1 << 0)
 #define CUT_OPT_CHAR_FLGS     (1 << 1)
 #define CUT_OPT_FIELDS_FLGS   (1 << 2)
 #define CUT_OPT_DELIM_FLGS    (1 << 3)
-#define CUT_OPT_SUPPRESS_FLGS (1 << 4)
+#define CUT_OPT_ODELIM_FLGS   (1 << 4)
+#define CUT_OPT_SUPPRESS_FLGS (1 << 5)
+#define CUT_OPT_NOSORT_FLGS   (1 << 6)
+#define CUT_OPT_REGEX_FLGS    ((1 << 7) * ENABLE_FEATURE_CUT_REGEX)
 
 struct cut_list {
 	int startpos;
 	int endpos;
 };
 
-enum {
-	BOL = 0,
-	EOL = INT_MAX,
-	NON_RANGE = -1
-};
-
 static int cmpfunc(const void *a, const void *b)
 {
 	return (((struct cut_list *) a)->startpos -
 			((struct cut_list *) b)->startpos);
 }
 
-static void cut_file(FILE *file, char delim, const struct cut_list *cut_lists, \
unsigned nlists) +static void cut_file(FILE *file, const char *delim, const char \
*odelim, +		const struct cut_list *cut_lists, unsigned nlists)
 {
 	char *line;
 	unsigned linenum = 0;	/* keep these zero-based to be consistent */
+	regex_t reg;
+	int spos, shoe = option_mask32 & CUT_OPT_REGEX_FLGS;
+
+	if (shoe) xregcomp(&reg, delim, REG_EXTENDED);
 
 	/* go through every line in the file */
 	while ((line = xmalloc_fgetline(file)) != NULL) {
@@ -80,29 +103,22 @@ static void cut_file(FILE *file, char delim, const struct \
cut_list *cut_lists, u  char *printed = xzalloc(linelen + 1);
 		char *orig_line = line;
 		unsigned cl_pos = 0;
-		int spos;
 
 		/* cut based on chars/bytes XXX: only works when sizeof(char) == byte */
 		if (option_mask32 & (CUT_OPT_CHAR_FLGS | CUT_OPT_BYTE_FLGS)) {
 			/* print the chars specified in each cut list */
 			for (; cl_pos < nlists; cl_pos++) {
-				spos = cut_lists[cl_pos].startpos;
-				while (spos < linelen) {
+				for (spos = cut_lists[cl_pos].startpos; spos < linelen;) {
 					if (!printed[spos]) {
 						printed[spos] = 'X';
 						putchar(line[spos]);
 					}
-					spos++;
-					if (spos > cut_lists[cl_pos].endpos
-					/* NON_RANGE is -1, so if below is true,
-					 * the above was true too (spos is >= 0) */
-					/* || cut_lists[cl_pos].endpos == NON_RANGE */
-					) {
+					if (++spos > cut_lists[cl_pos].endpos) {
 						break;
 					}
 				}
 			}
-		} else if (delim == '\n') {	/* cut by lines */
+		} else if (*delim == '\n') {	/* cut by lines */
 			spos = cut_lists[cl_pos].startpos;
 
 			/* get out if we have no more lists to process or if the lines
@@ -115,9 +131,7 @@ static void cut_file(FILE *file, char delim, const struct \
cut_list *cut_lists, u  while (spos < (int)linenum) {
 				spos++;
 				/* go to the next list if we're at the end of this one */
-				if (spos > cut_lists[cl_pos].endpos
-				 || cut_lists[cl_pos].endpos == NON_RANGE
-				) {
+				if (spos > cut_lists[cl_pos].endpos) {
 					cl_pos++;
 					/* get out if there's no more lists to process */
 					if (cl_pos >= nlists)
@@ -135,55 +149,56 @@ static void cut_file(FILE *file, char delim, const struct \
cut_list *cut_lists, u  puts(line);
 			goto next_line;
 		} else {		/* cut by fields */
-			int ndelim = -1;	/* zero-based / one-based problem */
-			int nfields_printed = 0;
-			char *field = NULL;
-			char delimiter[2];
-
-			delimiter[0] = delim;
-			delimiter[1] = 0;
-
-			/* does this line contain any delimiters? */
-			if (strchr(line, delim) == NULL) {
-				if (!(option_mask32 & CUT_OPT_SUPPRESS_FLGS))
-					puts(line);
-				goto next_line;
-			}
-
-			/* process each list on this line, for as long as we've got
-			 * a line to process */
-			for (; cl_pos < nlists && line; cl_pos++) {
-				spos = cut_lists[cl_pos].startpos;
-				do {
-					/* find the field we're looking for */
-					while (line && ndelim < spos) {
-						field = strsep(&line, delimiter);
-						ndelim++;
-					}
-
-					/* we found it, and it hasn't been printed yet */
-					if (field && ndelim == spos && !printed[ndelim]) {
-						/* if this isn't our first time through, we need to
-						 * print the delimiter after the last field that was
-						 * printed */
-						if (nfields_printed > 0)
-							putchar(delim);
-						fputs_stdout(field);
-						printed[ndelim] = 'X';
-						nfields_printed++;	/* shouldn't overflow.. */
+			unsigned uu = 0, start = 0, end = 0, out = 0;
+			int dcount = 0;
+
+			/* Loop through bytes, finding next delimiter */
+			for (;;) {
+				/* End of current range? */
+				if (end == linelen || dcount > cut_lists[cl_pos].endpos) {
+					if (++cl_pos >= nlists) break;
+					if (option_mask32 & CUT_OPT_NOSORT_FLGS)
+						start = dcount = uu = 0;
+					end = 0;
+				}
+				/* End of current line? */
+				if (uu == linelen) {
+					/* If we've seen no delimiters, check -s */
+					if (!cl_pos && !dcount && !shoe) {
+						if (option_mask32 & CUT_OPT_SUPPRESS_FLGS)
+							goto next_line;
+					} else if (dcount<cut_lists[cl_pos].startpos)
+						start = linelen;
+					end = linelen;
+				} else {
+					/* Find next delimiter */
+					if (shoe) {
+						regmatch_t rr = {-1, -1};
+
+						if (!regexec(&reg, line+uu, 1, &rr, REG_NOTBOL|REG_NOTEOL)) {
+							end = uu + rr.rm_so;
+							uu += rr.rm_eo;
+						} else {
+							uu = linelen;
+							continue;
+						}
+					} else if (line[end = uu++] != *delim)
+						continue;
+
+					/* Got delimiter. Loop if not yet within range. */
+					if (dcount++ < cut_lists[cl_pos].startpos) {
+						start = uu;
+						continue;
 					}
-
-					spos++;
-
-					/* keep going as long as we have a line to work with,
-					 * this is a list, and we're not at the end of that
-					 * list */
-				} while (spos <= cut_lists[cl_pos].endpos && line
-						&& cut_lists[cl_pos].endpos != NON_RANGE);
+				}
+				if (end != start || !shoe)
+					printf("%s%.*s", out++ ? odelim : "", end-start, line + start);
+				start = uu;
+				if (!dcount)
+					break;
 			}
 		}
-		/* if we printed anything at all, we need to finish it with a
-		 * newline cuz we were handed a chomped line */
+		/* if we printed anything, finish with newline */
 		putchar('\n');
  next_line:
 		linenum++;
@@ -198,37 +213,35 @@ int cut_main(int argc UNUSED_PARAM, char **argv)
 	/* growable array holding a series of lists */
 	struct cut_list *cut_lists = NULL;
 	unsigned nlists = 0;	/* number of elements in above list */
-	char delim = '\t';	/* delimiter, default is tab */
 	char *sopt, *ltok;
+	const char *delim = NULL;
+	const char *odelim = NULL;
 	unsigned opt;
 
+#define ARG "bcf"IF_FEATURE_CUT_REGEX("F")
 	opt = getopt32(argv, "^"
-			OPT_STR
-			"\0" "b--bcf:c--bcf:f--bcf",
-			&sopt, &sopt, &sopt, &ltok
+			OPT_STR  // = "b:c:f:d:O:sD"IF_FEATURE_CUT_REGEX("F:")"n"
+			"\0" "b--"ARG":c--"ARG":f--"ARG IF_FEATURE_CUT_REGEX("F--"ARG),
+			&sopt, &sopt, &sopt, &delim, &odelim IF_FEATURE_CUT_REGEX(, &sopt)
 	);
+	if (!delim || !*delim)
+		delim = (opt & CUT_OPT_REGEX_FLGS) ? "[[:space:]]+" : "\t";
+	if (!odelim) odelim = (opt & CUT_OPT_REGEX_FLGS) ? " " : delim;
+
 //	argc -= optind;
 	argv += optind;
-	if (!(opt & (CUT_OPT_BYTE_FLGS | CUT_OPT_CHAR_FLGS | CUT_OPT_FIELDS_FLGS)))
+	if (!(opt & (CUT_OPT_BYTE_FLGS | CUT_OPT_CHAR_FLGS | CUT_OPT_FIELDS_FLGS | \
CUT_OPT_REGEX_FLGS)))  bb_simple_error_msg_and_die("expected a list of bytes, \
characters, or fields");  
-	if (opt & CUT_OPT_DELIM_FLGS) {
-		if (ltok[0] && ltok[1]) { /* more than 1 char? */
-			bb_simple_error_msg_and_die("the delimiter must be a single character");
-		}
-		delim = ltok[0];
-	}
-
 	/*  non-field (char or byte) cutting has some special handling */
-	if (!(opt & CUT_OPT_FIELDS_FLGS)) {
+	if (!(opt & (CUT_OPT_FIELDS_FLGS|CUT_OPT_REGEX_FLGS))) {
 		static const char _op_on_field[] ALIGN1 = " only when operating on fields";
 
 		if (opt & CUT_OPT_SUPPRESS_FLGS) {
 			bb_error_msg_and_die
-				("suppressing non-delimited lines makes sense%s",
-				_op_on_field);
+				("suppressing non-delimited lines makes sense%s", _op_on_field);
 		}
-		if (delim != '\t') {
+		if (opt & CUT_OPT_DELIM_FLGS) {
 			bb_error_msg_and_die
 				("a delimiter may be specified%s", _op_on_field);
 		}
@@ -253,7 +266,7 @@ int cut_main(int argc UNUSED_PARAM, char **argv)
 			/* get the start pos */
 			ntok = strsep(&ltok, "-");
 			if (!ntok[0]) {
-				s = BOL;
+				s = 0;
 			} else {
 				s = xatoi_positive(ntok);
 				/* account for the fact that arrays are zero based, while
@@ -264,24 +277,23 @@ int cut_main(int argc UNUSED_PARAM, char **argv)
 
 			/* get the end pos */
 			if (ltok == NULL) {
-				e = NON_RANGE;
+				e = s;
 			} else if (!ltok[0]) {
-				e = EOL;
+				e = INT_MAX;
 			} else {
 				e = xatoi_positive(ltok);
 				/* if the user specified and end position of 0,
 				 * that means "til the end of the line" */
-				if (e == 0)
-					e = EOL;
+				if (!*ltok)
+					e = INT_MAX;
+				else if (e < s)
+					bb_error_msg_and_die("%d<%d", e, s);
 				e--;	/* again, arrays are zero based, lines are 1 based */
-				if (e == s)
-					e = NON_RANGE;
 			}
 
 			/* add the new list */
 			cut_lists = xrealloc_vector(cut_lists, 4, nlists);
-			/* NB: startpos is always >= 0,
-			 * while endpos may be = NON_RANGE (-1) */
+			/* NB: startpos is always >= 0 */
 			cut_lists[nlists].startpos = s;
 			cut_lists[nlists].endpos = e;
 			nlists++;
@@ -294,7 +306,8 @@ int cut_main(int argc UNUSED_PARAM, char **argv)
 		/* now that the lists are parsed, we need to sort them to make life
 		 * easier on us when it comes time to print the chars / fields / lines
 		 */
-		qsort(cut_lists, nlists, sizeof(cut_lists[0]), cmpfunc);
+		if (!(opt & CUT_OPT_NOSORT_FLGS))
+			qsort(cut_lists, nlists, sizeof(cut_lists[0]), cmpfunc);
 	}
 
 	{
@@ -309,7 +322,7 @@ int cut_main(int argc UNUSED_PARAM, char **argv)
 				retval = EXIT_FAILURE;
 				continue;
 			}
-			cut_file(file, delim, cut_lists, nlists);
+			cut_file(file, delim, odelim, cut_lists, nlists);
 			fclose_if_not_stdin(file);
 		} while (*++argv);
 
diff --git a/testsuite/cut.tests b/testsuite/cut.tests
index 110340277..d705b91c3 100755
--- a/testsuite/cut.tests
+++ b/testsuite/cut.tests
@@ -15,4 +15,68 @@ testing "cut '-' (stdin) and multi file handling" \
 	"the quick brown fox\n" \
 	"jumps over the lazy dog\n" \
 
+abc="\
+one:two:three:four:five:six:seven
+alpha:beta:gamma:delta:epsilon:zeta:eta:theta:iota:kappa:lambda:mu
+the quick brown fox jumps over the lazy dog
+"
+
+testing "cut -b a,a,a" "cut -b 3,3,3 input" "e\np\ne\n" "$abc" ""
+
+testing "cut -b overlaps" "cut -b 1-3,2-5,7-9,9-10 input" \
+  "one:to:th\nalphabeta\nthe qick \n" "$abc" ""
+testing "-b encapsulated" "cut -b 3-8,4-6 input" "e:two:\npha:be\ne quic\n" \
+  "$abc" ""
+# --output-delimiter not implemnted (yet?)
+#testing "cut -bO overlaps" \
+#  "cut --output-delimiter ' ' -b 1-3,2-5,7-9,9-10 input" \
+#  "one:t o:th\nalpha beta\nthe q ick \n" "$abc" ""
+testing "cut high-low error" "cut -b 8-3 abc.txt 2>/dev/null || echo err" "err\n" \
+  "$abc" ""
+
+testing "cut -c a-b" "cut -c 4-10 input" ":two:th\nha:beta\n quick \n" "$abc" ""
+testing "cut -c a-" "cut -c 41- input" "\ntheta:iota:kappa:lambda:mu\ndog\n" "$abc" \
"" +testing "cut -c -b" "cut -c -39 input" \
+  "one:two:three:four:five:six:seven\nalpha:beta:gamma:delta:epsilon:zeta:eta\nthe \
quick brown fox jumps over the lazy\n" \ +  "$abc" ""
+testing "cut -c a" "cut -c 40 input" "\n:\n \n" "$abc" ""
+testing "cut -c a,b-c,d" "cut -c 3,5-7,10 input" "etwoh\npa:ba\nequi \n" "$abc" ""
+
+testing "cut -f a-" "cut -d ':' -f 5- input" \
"five:six:seven\nepsilon:zeta:eta:theta:iota:kappa:lambda:mu\nthe quick brown fox \
jumps over the lazy dog\n" "$abc" "" +
+testing "cut show whole line with no delim" "cut -d ' ' -f 3 input" \
+	"one:two:three:four:five:six:seven\nalpha:beta:gamma:delta:epsilon:zeta:eta:theta:iota:kappa:lambda:mu\nbrown\n" \
"$abc" "" +
+testing "cut with echo, -c (a-b)" "echo 'ref_categorie=test' | cut -c 1-15 " \
"ref_categorie=t\n" "" "" +testing "cut with echo, -c (a)" "echo 'ref_categorie=test' \
| cut -c 14" "=\n" "" "" +
+testing "cut with -c (a,b,c)" "cut -c 4,5,20 input" "det\n" \
"abcdefghijklmnopqrstuvwxyz" "" +
+testing "cut with -b (a,b,c)" "cut -b 4,5,20 input" "det\n" \
"abcdefghijklmnopqrstuvwxyz" "" +
+input="\
+406378:Sales:Itorre:Jan
+031762:Marketing:Nasium:Jim
+636496:Research:Ancholie:Mel
+396082:Sales:Jucacion:Ed
+"
+testing "cut with -d -f(:) -s" "cut -d: -f3 -s input" \
"Itorre\nNasium\nAncholie\nJucacion\n" "$input" "" +testing "cut with -d -f( ) -s" \
"cut -d' ' -f3 -s input && echo yes" "yes\n" "$input" "" +testing "cut with -d -f(a) \
-s" "cut -da -f3 -s input" "n\nsium:Jim\n\ncion:Ed\n" "$input" "" +testing "cut with \
-d -f(a) -s -n" "cut -da -f3 -s -n input" "n\nsium:Jim\n\ncion:Ed\n" "$input" "" +
+# substitute for awk
+testing "cut -DF" "cut -DF 2,7,5" \
+  "said and your\nare\nis demand. supply\nforecast :\nyou you better,\n\nEm: Took \
hate\n" "" \ +"Bother, said Pooh. It's your husband, and he has a gun.
+Cheerios are donut seeds.
+Talk is cheap because supply exceeds demand.
+Weather forecast for tonight : dark.
+Apple: you can buy better, but you can't pay more.
+Subcalifragilisticexpialidocious.
+Auntie Em: Hate you, hate Kansas. Took the dog. Dorothy."
+
+testing "cut empty field" "cut -d ':' -f 1-3" "a::b\n" "" "a::b\n"
+testing "cut empty field 2" "cut -d ':' -f 3-5" "b::c\n" "" "a::b::c:d\n"
+
 exit $FAILCOUNT
_______________________________________________
busybox-cvs mailing list
busybox-cvs@busybox.net
http://lists.busybox.net/mailman/listinfo/busybox-cvs


[prev in list] [next in list] [prev in thread] [next in thread] 

Configure | About | News | Add a list | Sponsored by KoreLogic