thr3ads.net - klibc - [klibc] [klibc:update-dash] dash: parser: Add syntax stack for recursive parsing [Mar 2020]

If this information is useful, please help other people find it:
Share via:
klibc-bot for Herbert Xu
2020-Mar-28 21:49 UTC
[klibc] [klibc:update-dash] dash: parser: Add syntax stack for recursive parsing

Commit-ID:  ef6ed3c138aaf680ff3e4e0d7fbad6d2c61fb7d0
Gitweb:    
http://git.kernel.org/?p=libs/klibc/klibc.git;a=commit;h=ef6ed3c138aaf680ff3e4e0d7fbad6d2c61fb7d0
Author:     Herbert Xu <herbert at gondor.apana.org.au>
AuthorDate: Fri, 9 Mar 2018 00:14:02 +0800
Committer:  Ben Hutchings <ben at decadent.org.uk>
CommitDate: Sat, 28 Mar 2020 21:42:54 +0000

[klibc] dash: parser: Add syntax stack for recursive parsing

[ dash commit ab1cecb4047864afb247a6ed691e7f59ce716f2c ]

Without a stack of syntaxes we cannot correctly these two cases
together:

        "${a#'$$'}"
        "${a#"${b-'$$'}"}"

A recursive parser also helps in some other corner cases such
as nested arithmetic expansion with paratheses.

This patch adds a syntax stack allocated from the stack using
alloca.  As a side-effect this allows us to remove the naked
backslashes for patterns within double-quotes, which means that
EXP_QPAT also has to go.

This patch also fixes removes any backslashes that precede right
braces when they are present within a parameter expansion context,
and backslashes that precede double quotes within inner double
quotes inside a parameter expansion in a here-document context.

The idea of a recursive parser is based on a patch by Harald van
Dijk.

Signed-off-by: Herbert Xu <herbert at gondor.apana.org.au>
Signed-off-by: Ben Hutchings <ben at decadent.org.uk>

---
 usr/dash/expand.c |  27 +++------
 usr/dash/expand.h |   1 -
 usr/dash/parser.c | 177 ++++++++++++++++++++++++++++++++++--------------------
 3 files changed, 120 insertions(+), 85 deletions(-)

diff --git a/usr/dash/expand.c b/usr/dash/expand.c
index e86bd29d..2c29ea44 100644
--- a/usr/dash/expand.c
+++ b/usr/dash/expand.c
@@ -85,7 +85,7 @@
 #define RMESCAPE_HEAP	0x10	/* Malloc strings instead of stalloc */
 
 /* Add CTLESC when necessary. */
-#define QUOTES_ESC	(EXP_FULL | EXP_CASE | EXP_QPAT)
+#define QUOTES_ESC	(EXP_FULL | EXP_CASE)
 /* Do not skip NUL characters. */
 #define QUOTES_KEEPNUL	EXP_TILDE
 
@@ -335,16 +335,6 @@ addquote:
 		case CTLESC:
 			startloc++;
 			length++;
-
-			/*
-			 * Quoted parameter expansion pattern: remove quote
-			 * unless inside inner quotes or we have a literal
-			 * backslash.
-			 */
-			if (((flag | inquotes) & (EXP_QPAT | EXP_QUOTED)) =-			    EXP_QPAT
&& *p != '\\')
-				break;
-
 			goto addquote;
 		case CTLVAR:
 			p = evalvar(p, flag | inquotes);
@@ -653,8 +643,7 @@ subevalvar(char *p, char *str, int strloc, int subtype, int
startloc, int varfla
 	char *(*scan)(char *, char *, char *, char *, int , int);
 
 	argstr(p, EXP_TILDE | (subtype != VSASSIGN && subtype != VSQUESTION ?
-			       (flag & (EXP_QUOTED | EXP_QPAT) ?
-			        EXP_QPAT : EXP_CASE) : 0));
+			       EXP_CASE : 0));
 	STPUTC('\0', expdest);
 	argbackq = saveargbackq;
 	startp = stackblock() + startloc;
@@ -1646,7 +1635,6 @@ char *
 _rmescapes(char *str, int flag)
 {
 	char *p, *q, *r;
-	unsigned inquotes;
 	int notescaped;
 	int globbing;
 
@@ -1676,24 +1664,23 @@ _rmescapes(char *str, int flag)
 			q = mempcpy(q, str, len);
 		}
 	}
-	inquotes = 0;
 	globbing = flag & RMESCAPE_GLOB;
 	notescaped = globbing;
 	while (*p) {
 		if (*p == (char)CTLQUOTEMARK) {
-			inquotes = ~inquotes;
 			p++;
 			notescaped = globbing;
 			continue;
 		}
+		if (*p == '\\') {
+			/* naked back slash */
+			notescaped = 0;
+			goto copy;
+		}
 		if (*p == (char)CTLESC) {
 			p++;
 			if (notescaped)
 				*q++ = '\\';
-		} else if (*p == '\\' && !inquotes) {
-			/* naked back slash */
-			notescaped = 0;
-			goto copy;
 		}
 		notescaped = globbing;
 copy:
diff --git a/usr/dash/expand.h b/usr/dash/expand.h
index e2be8aa4..5c767e66 100644
--- a/usr/dash/expand.h
+++ b/usr/dash/expand.h
@@ -58,7 +58,6 @@ struct arglist {
 #define	EXP_VARTILDE	0x4	/* expand tildes in an assignment */
 #define	EXP_REDIR	0x8	/* file glob for a redirection (1 match only) */
 #define EXP_CASE	0x10	/* keeps quotes around for CASE pattern */
-#define EXP_QPAT	0x20	/* pattern in quoted parameter expansion */
 #define EXP_VARTILDE2	0x40	/* expand tildes after colons only */
 #define EXP_WORD	0x80	/* expand word in parameter expansion */
 #define EXP_QUOTED	0x100	/* expand word in double quotes */
diff --git a/usr/dash/parser.c b/usr/dash/parser.c
index 8b945e36..c28363ca 100644
--- a/usr/dash/parser.c
+++ b/usr/dash/parser.c
@@ -80,6 +80,18 @@ struct heredoc {
 	int striptabs;		/* if set, strip leading tabs */
 };
 
+struct synstack {
+	const char *syntax;
+	struct synstack *prev;
+	struct synstack *next;
+	int innerdq;
+	int varpushed;
+	int dblquote;
+	int varnest;		/* levels of variables expansion */
+	int parenlevel;		/* levels of parens in arithmetic */
+	int dqvarnest;		/* levels of variables expansion within double quotes */
+};
+
 
 
 struct heredoc *heredoclist;	/* list of here documents to read */
@@ -841,6 +853,21 @@ static int pgetc_eatbnl(void)
 	return c;
 }
 
+static void synstack_push(struct synstack **stack, struct synstack *next,
+			  const char *syntax)
+{
+	memset(next, 0, sizeof(*next));
+	next->syntax = syntax;
+	next->next = *stack;
+	(*stack)->prev = next;
+	*stack = next;
+}
+
+static void synstack_pop(struct synstack **stack)
+{
+	*stack = (*stack)->next;
+}
+
 
 
 /*
@@ -870,24 +897,15 @@ readtoken1(int firstc, char const *syntax, char *eofmark,
int striptabs)
 	size_t len;
 	struct nodelist *bqlist;
 	int quotef;
-	int dblquote;
-	int varnest;	/* levels of variables expansion */
-	int arinest;	/* levels of arithmetic expansion */
-	int parenlevel;	/* levels of parens in arithmetic */
-	int dqvarnest;	/* levels of variables expansion within double quotes */
 	int oldstyle;
-	/* syntax before arithmetic */
-	char const *uninitialized_var(prevsyntax);
+	/* syntax stack */
+	struct synstack synbase = { .syntax = syntax };
+	struct synstack *synstack = &synbase;
 
-	dblquote = 0;
 	if (syntax == DQSYNTAX)
-		dblquote = 1;
+		synstack->dblquote = 1;
 	quotef = 0;
 	bqlist = NULL;
-	varnest = 0;
-	arinest = 0;
-	parenlevel = 0;
-	dqvarnest = 0;
 
 	STARTSTACKSTR(out);
 	loop: {	/* for each line, until end of word */
@@ -895,7 +913,7 @@ readtoken1(int firstc, char const *syntax, char *eofmark,
int striptabs)
 		if (c == '\034' && doprompt
 		 && attyset() && ! equal(termval(), "emacs")) {
 			attyline();
-			if (syntax == BASESYNTAX)
+			if (synstack->syntax == BASESYNTAX)
 				return readtoken();
 			c = syntax == SQSYNTAX ? pgetc() : pgetc_eatbnl();
 			goto loop;
@@ -904,9 +922,9 @@ readtoken1(int firstc, char const *syntax, char *eofmark,
int striptabs)
 		CHECKEND();	/* set c to PEOF if at end of here document */
 		for (;;) {	/* until end of line or end of word */
 			CHECKSTRSPACE(4, out);	/* permit 4 calls to USTPUTC */
-			switch(syntax[c]) {
+			switch(synstack->syntax[c]) {
 			case CNL:	/* '\n' */
-				if (syntax == BASESYNTAX)
+				if (synstack->syntax == BASESYNTAX)
 					goto endword;	/* exit outer loop */
 				USTPUTC(c, out);
 				nlprompt();
@@ -916,7 +934,7 @@ readtoken1(int firstc, char const *syntax, char *eofmark,
int striptabs)
 				USTPUTC(c, out);
 				break;
 			case CCTL:
-				if (eofmark == NULL || dblquote)
+				if (eofmark == NULL || synstack->dblquote)
 					USTPUTC(CTLESC, out);
 				USTPUTC(c, out);
 				break;
@@ -929,13 +947,18 @@ readtoken1(int firstc, char const *syntax, char *eofmark,
int striptabs)
 					pungetc();
 				} else {
 					if (
-						dblquote &&
+						synstack->dblquote &&
 						c != '\\' && c != '`' &&
 						c != '$' && (
 							c != '"' ||
-							eofmark != NULL
+							(eofmark != NULL &&
+							 !synstack->varnest)
+						) && (
+							c != '}' ||
+							!synstack->varnest
 						)
 					) {
+						USTPUTC(CTLESC, out);
 						USTPUTC('\\', out);
 					}
 					USTPUTC(CTLESC, out);
@@ -944,55 +967,64 @@ readtoken1(int firstc, char const *syntax, char *eofmark,
int striptabs)
 				}
 				break;
 			case CSQUOTE:
-				syntax = SQSYNTAX;
+				synstack->syntax = SQSYNTAX;
 quotemark:
 				if (eofmark == NULL) {
 					USTPUTC(CTLQUOTEMARK, out);
 				}
 				break;
 			case CDQUOTE:
-				syntax = DQSYNTAX;
-				dblquote = 1;
+				synstack->syntax = DQSYNTAX;
+				synstack->dblquote = 1;
+toggledq:
+				if (synstack->varnest)
+					synstack->innerdq ^= 1;
 				goto quotemark;
 			case CENDQUOTE:
-				if (eofmark && !varnest)
+				if (eofmark && !synstack->varnest) {
 					USTPUTC(c, out);
-				else {
-					if (dqvarnest == 0) {
-						syntax = BASESYNTAX;
-						dblquote = 0;
-					}
-					quotef++;
-					goto quotemark;
+					break;
 				}
-				break;
+
+				if (synstack->dqvarnest == 0) {
+					synstack->syntax = BASESYNTAX;
+					synstack->dblquote = 0;
+				}
+
+				quotef++;
+
+				if (c == '"')
+					goto toggledq;
+
+				goto quotemark;
 			case CVAR:	/* '$' */
 				PARSESUB();		/* parse substitution */
 				break;
 			case CENDVAR:	/* '}' */
-				if (varnest > 0) {
-					varnest--;
-					if (dqvarnest > 0) {
-						dqvarnest--;
-					}
+				if (!synstack->innerdq &&
+				    synstack->varnest > 0) {
+					if (!--synstack->varnest &&
+					    synstack->varpushed)
+						synstack_pop(&synstack);
+					else if (synstack->dqvarnest > 0)
+						synstack->dqvarnest--;
 					USTPUTC(CTLENDVAR, out);
 				} else {
 					USTPUTC(c, out);
 				}
 				break;
 			case CLP:	/* '(' in arithmetic */
-				parenlevel++;
+				synstack->parenlevel++;
 				USTPUTC(c, out);
 				break;
 			case CRP:	/* ')' in arithmetic */
-				if (parenlevel > 0) {
+				if (synstack->parenlevel > 0) {
 					USTPUTC(c, out);
-					--parenlevel;
+					--synstack->parenlevel;
 				} else {
 					if (pgetc_eatbnl() == ')') {
 						USTPUTC(CTLENDARI, out);
-						if (!--arinest)
-							syntax = prevsyntax;
+						synstack_pop(&synstack);
 					} else {
 						/*
 						 * unbalanced parens
@@ -1011,7 +1043,7 @@ quotemark:
 			case CIGN:
 				break;
 			default:
-				if (varnest == 0)
+				if (synstack->varnest == 0)
 					goto endword;	/* exit outer loop */
 				if (c != PEOA) {
 					USTPUTC(c, out);
@@ -1021,11 +1053,11 @@ quotemark:
 		}
 	}
 endword:
-	if (syntax == ARISYNTAX)
+	if (synstack->syntax == ARISYNTAX)
 		synerror("Missing '))'");
-	if (syntax != BASESYNTAX && eofmark == NULL)
+	if (synstack->syntax != BASESYNTAX && eofmark == NULL)
 		synerror("Unterminated quoted string");
-	if (varnest != 0) {
+	if (synstack->varnest != 0) {
 		/* { */
 		synerror("Missing '}'");
 	}
@@ -1202,6 +1234,8 @@ parsesub: {
 			PARSEBACKQNEW();
 		}
 	} else {
+		const char *newsyn = synstack->syntax;
+
 		USTPUTC(CTLVAR, out);
 		typeloc = out - (char *)stackblock();
 		STADJUST(1, out);
@@ -1252,6 +1286,8 @@ varname:
 		}
 
 		if (subtype == 0) {
+			int cc = c;
+
 			switch (c) {
 			case ':':
 				subtype = VSNUL;
@@ -1265,27 +1301,40 @@ varname:
 				break;
 			case '%':
 			case '#':
-				{
-					int cc = c;
-					subtype = c == '#' ? VSTRIMLEFT :
-							     VSTRIMRIGHT;
-					c = pgetc_eatbnl();
-					if (c == cc)
-						subtype++;
-					else
-						pungetc();
-					break;
-				}
+				subtype = c == '#' ? VSTRIMLEFT :
+						     VSTRIMRIGHT;
+				c = pgetc_eatbnl();
+				if (c == cc)
+					subtype++;
+				else
+					pungetc();
+
+				newsyn = BASESYNTAX;
+				break;
 			}
 		} else {
 badsub:
 			pungetc();
 		}
+
+		if (newsyn == ARISYNTAX && subtype > VSNORMAL)
+			newsyn = DQSYNTAX;
+
+		if (newsyn != synstack->syntax) {
+			synstack_push(&synstack,
+				      synstack->prev ?:
+				      alloca(sizeof(*synstack)),
+				      newsyn);
+
+			synstack->varpushed++;
+			synstack->dblquote = newsyn != BASESYNTAX;
+		}
+
 		*((char *)stackblock() + typeloc) = subtype;
 		if (subtype != VSNORMAL) {
-			varnest++;
-			if (dblquote)
-				dqvarnest++;
+			synstack->varnest++;
+			if (synstack->dblquote)
+				synstack->dqvarnest++;
 		}
 		STPUTC('=', out);
 	}
@@ -1335,7 +1384,7 @@ parsebackq: {
 			case '\\':
                                 pc = pgetc_eatbnl();
                                 if (pc != '\\' && pc !=
'`' && pc != '$'
-                                    && (!dblquote || pc !=
'"'))
+                                    && (!synstack->dblquote || pc !=
'"'))
                                         STPUTC('\\', pout);
 				if (pc > PEOA) {
 					break;
@@ -1411,10 +1460,10 @@ done:
  */
 parsearith: {
 
-	if (++arinest == 1) {
-		prevsyntax = syntax;
-		syntax = ARISYNTAX;
-	}
+	synstack_push(&synstack,
+		      synstack->prev ?: alloca(sizeof(*synstack)),
+		      ARISYNTAX);
+	synstack->dblquote = 1;
 	USTPUTC(CTLARI, out);
 	goto parsearith_return;
 }
Maybe Matching Threads

Search for more apparently analagous threads
klibc - Mar 2020 - [klibc:update-dash] dash: parser: Add syntax stack for recursive parsing

[klibc] [klibc:update-dash] dash: parser: Add syntax stack for recursive parsing

Maybe Matching Threads

Wisdom of the Ancients