2 /*-------------------------------------------------------------------------
5 * lexical scanner for PostgreSQL
9 * The rules in this file must be kept in sync with psql's lexer!!!
11 * The rules are designed so that the scanner never has to backtrack,
12 * in the sense that there is always a rule that can match the input
13 * consumed so far (the rule action may internally throw back some input
14 * with yyless(), however). As explained in the flex manual, this makes
15 * for a useful speed increase --- about a third faster than a plain -CF
16 * lexer, in simple testing. The extra complexity is mostly in the rules
17 * for handling float numbers and continued string literals. If you change
18 * the lexical rules, verify that you haven't broken the no-backtrack
19 * property by running flex with the "-b" option and checking that the
20 * resulting "lex.backup" file says that no backing up is needed.
23 * Portions Copyright (c) 2003-2008, PgPool Global Development Group
24 * Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group
25 * Portions Copyright (c) 1994, Regents of the University of California
28 * $PostgreSQL: pgsql/src/backend/parser/scan.l,v 1.141 2007/09/12 20:49:27 adunstan Exp $
30 *-------------------------------------------------------------------------
32 #include "pool_parser.h"
41 #define ereport(a,b) yyerror("")
43 #define IS_HIGHBIT_SET(c) 0
45 #include "gramparse.h"
47 /* Not needed now that this file is compiled as part of gram.y */
48 /* #include "parser/parse.h" */
52 #include "pool_memory.h"
55 /* Avoid exit() on fatal scanner errors (a bit ugly -- see yy_fatal_error) */
58 #define fprintf(file, fmt, msg) ereport(ERROR, (errmsg_internal("%s", msg)))
61 static int xcdepth = 0; /* depth of nesting in slash-star comments */
62 static char *dolqstart; /* current $foo$ quote start string */
65 * GUC variables. This is a DIRECT violation of the warning given at the
66 * head of gram.y, i.e., flex/bison code must not depend on any GUC variables;
67 * as such, changing their values can induce very unintuitive behavior.
68 * But we shall have to live with it as a short-term thing until the switch
69 * to SQL-standard string syntax is complete.
75 BACKSLASH_QUOTE_SAFE_ENCODING
78 BackslashQuoteType backslash_quote = BACKSLASH_QUOTE_SAFE_ENCODING;
79 int escape_string_warning = true;
80 int standard_conforming_strings = false;
82 static bool warn_on_first_escape;
83 static bool saw_high_bit = false;
86 * literalbuf is used to accumulate literal values when multiple rules
87 * are needed to parse a single literal. Call startlit to reset buffer
88 * to empty, addlit to add text. Note that the buffer is palloc'd and
89 * starts life afresh on every parse cycle.
91 static char *literalbuf; /* expandable buffer */
92 static int literallen; /* actual current length */
93 static int literalalloc; /* current allocated buffer size */
95 #define startlit() (literalbuf[0] = '\0', literallen = 0)
96 static void addlit(char *ytext, int yleng);
97 static void addlitchar(unsigned char ychar);
98 static char *litbufdup(void);
101 static int lexer_errposition(void);
103 static void check_escape_warning(void);
104 static void check_string_escape_warning(unsigned char ychar);
109 * Each call to yylex must set yylloc to the location of the found token
110 * (expressed as a byte offset from the start of the input text).
111 * When we parse a token that requires multiple lexer rules to process,
112 * this should be done in the first such rule, else yylloc will point
113 * into the middle of the token.
115 #define SET_YYLLOC() (yylloc = yytext - scanbuf)
117 /* Handles to the buffer that the lexer uses internally */
118 static YY_BUFFER_STATE scanbufhandle;
119 static char *scanbuf;
121 static unsigned char unescape_single_char(unsigned char c);
122 void yyerror(const char *s);
127 %option never-interactive
131 %option prefix="base_yy"
134 * OK, here is a short description of lex/flex rules behavior.
135 * The longest pattern which matches an input string is always chosen.
136 * For equal-length patterns, the first occurring in the rules list is chosen.
137 * INITIAL is the starting state, to which all non-conditional rules apply.
138 * Exclusive states change parsing rules while the state is active. When in
139 * an exclusive state, only those rules defined for that state apply.
141 * We use exclusive states for quoted strings, extended comments,
142 * and to eliminate parsing troubles for numeric strings.
144 * <xb> bit string literal
145 * <xc> extended C-style comments
146 * <xd> delimited identifiers (double-quoted identifiers)
147 * <xh> hexadecimal numeric string
148 * <xq> standard quoted strings
149 * <xe> extended quoted strings (support backslash escape sequences)
150 * <xdolq> $foo$ quoted strings
162 * In order to make the world safe for Windows and Mac clients as well as
163 * Unix ones, we accept either \n or \r as a newline. A DOS-style \r\n
164 * sequence will be seen as two successive newlines, but that doesn't cause
165 * any problems. Comments that start with -- and extend to the next
166 * newline are treated as equivalent to a single whitespace character.
168 * NOTE a fine point: if there is no newline following --, we will absorb
169 * everything to the end of the input as a comment. This is correct. Older
170 * versions of Postgres failed to recognize -- as a comment if the input
171 * did not end with a newline.
173 * XXX perhaps \f (formfeed) should be treated as a newline as well?
175 * XXX if you change the set of whitespace characters, fix scanner_isspace()
176 * to agree, and see also the plpgsql lexer.
184 comment ("--"{non_newline}*)
186 whitespace ({space}+|{comment})
189 * SQL requires at least one newline in the whitespace separating
190 * string literals that are to be concatenated. Silly, but who are we
191 * to argue? Note that {whitespace_with_newline} should not have * after
192 * it, whereas {whitespace} should generally have a * after it...
195 special_whitespace ({space}+|{comment}{newline})
196 horiz_whitespace ({horiz_space}|{comment})
197 whitespace_with_newline ({horiz_whitespace}*{newline}{special_whitespace}*)
200 * To ensure that {quotecontinue} can be scanned without having to back up
201 * if the full pattern isn't matched, we include trailing whitespace in
202 * {quotestop}. This matches all cases where {quotecontinue} fails to match,
203 * except for {quote} followed by whitespace and just one "-" (not two,
204 * which would start a {comment}). To cover that we have {quotefail}.
205 * The actions for {quotestop} and {quotefail} must throw back characters
206 * beyond the quote proper.
209 quotestop {quote}{whitespace}*
210 quotecontinue {quote}{whitespace_with_newline}{quote}
211 quotefail {quote}{whitespace}*"-"
214 * It is tempting to scan the string for only those characters
215 * which are allowed. However, this leads to silently swallowed
216 * characters if illegal characters are included in the string.
217 * For example, if xbinside is [01] then B'ABCD' is interpreted
218 * as a zero-length string, and the ABCD' is lost!
219 * Better to pass the string forward and let the input routines
220 * validate the contents.
225 /* Hexadecimal number */
229 /* National character */
232 /* Quoted string that allows backslash escapes */
236 xeoctesc [\\][0-7]{1,3}
237 xehexesc [\\]x[0-9A-Fa-f]{1,2}
240 * xqdouble implements embedded quote, ''''
243 xqdouble {quote}{quote}
246 /* $foo$ style quotes ("dollar quoting")
247 * The quoted string starts with $foo$ where "foo" is an optional string
248 * in the form of an identifier, except that it may not contain "$",
249 * and extends to the first occurrence of an identical string.
250 * There is *no* processing of the quoted text.
252 * {dolqfailed} is an error rule to avoid scanner backup when {dolqdelim}
253 * fails to match its trailing "$".
255 dolq_start [A-Za-z\200-\377_]
256 dolq_cont [A-Za-z\200-\377_0-9]
257 dolqdelim \$({dolq_start}{dolq_cont}*)?\$
258 dolqfailed \${dolq_start}{dolq_cont}*
262 * Allows embedded spaces and other special characters in identifiers.
267 xddouble {dquote}{dquote}
272 * The "extended comment" syntax closely resembles allowable operator syntax.
273 * The tricky part here is to get lex to recognize a string starting with
274 * slash-star as a comment, when interpreting it as an operator would produce
275 * a longer match --- remember lex will prefer a longer match! Also, if we
276 * have something like plus-slash-star, lex will think this is a 3-character
277 * operator whereas we want to see it as a + operator and a comment start.
278 * The solution is two-fold:
279 * 1. append {op_chars}* to xcstart so that it matches as much text as
280 * {operator} would. Then the tie-breaker (first matching rule of same
281 * length) ensures xcstart wins. We put back the extra stuff with yyless()
282 * in case it contains a star-slash that should terminate the comment.
283 * 2. In the operator rule, check for slash-star within the operator, and
284 * if found throw it back with yyless(). This handles the plus-slash-star
286 * Dash-dash comments have similar interactions with the operator rule.
288 xcstart \/\*{op_chars}*
293 ident_start [A-Za-z\200-\377_]
294 ident_cont [A-Za-z\200-\377_0-9\$]
296 identifier {ident_start}{ident_cont}*
301 * "self" is the set of chars that should be returned as single-character
302 * tokens. "op_chars" is the set of chars that can make up "Op" tokens,
303 * which can be one or more characters long (but if a single-char token
304 * appears in the "self" set, it is not to be returned as an Op). Note
305 * that the sets overlap, but each has some chars that are not in the other.
307 * If you change either set, adjust the character lists appearing in the
308 * rule for "operator"!
310 self [,()\[\].;\:\+\-\*\/\%\^\<\>\=]
311 op_chars [\~\!\@\#\^\&\|\`\?\+\-\*\/\%\<\>\=]
314 /* We no longer allow unary minus in numbers.
315 * Instead we pass it separately to the parser, where it gets
316 * coerced via doNegate() -- Leon, Aug 20 1999
318 * {realfail1} and {realfail2} are added to prevent the need for scanner
319 * backup when the {real} rule fails to match completely.
323 decimal (({digit}*\.{digit}+)|({digit}+\.{digit}*))
324 real ({integer}|{decimal})[Ee][-+]?{digit}+
325 realfail1 ({integer}|{decimal})[Ee]
326 realfail2 ({integer}|{decimal})[Ee][-+]
333 * Dollar quoted strings are totally opaque, and no escaping is done on them.
334 * Other quoted strings must allow some special characters such as single-quote
336 * Embedded single-quotes are implemented both in the SQL standard
337 * style of two adjacent single quotes "''" and in the Postgres/Java style
338 * of escaped-quote "\'".
339 * Other embedded escaped characters are matched explicitly and the leading
340 * backslash is dropped from the string.
341 * Note that xcstart must appear before operator, as explained above!
342 * Also whitespace (comment) must appear before operator.
352 /* Set location in case of syntax error in comment */
356 /* Put back any characters past slash-star; see above */
362 /* Put back any characters past slash-star; see above */
385 <xc><<EOF>> { yyerror("unterminated /* comment"); }
389 * At some point we should simply pass the string
390 * forward to the parser and label it there.
391 * In the meantime, place a leading "b" on the string
392 * to mark it for the input routine as a binary string.
403 yylval.str = litbufdup();
408 addlit(yytext, yyleng);
410 <xh>{quotecontinue} |
411 <xb>{quotecontinue} {
414 <xb><<EOF>> { yyerror("unterminated bit string literal"); }
417 /* Hexadecimal bit type.
418 * At some point we should simply pass the string
419 * forward to the parser and label it there.
420 * In the meantime, place a leading "x" on the string
421 * to mark it for the input routine as a hex string.
432 yylval.str = litbufdup();
435 <xh><<EOF>> { yyerror("unterminated hexadecimal string literal"); }
438 /* National character.
439 * We will pass this along as a normal character string,
440 * but preceded by an internally-generated "NCHAR".
442 const ScanKeyword *keyword;
445 yyless(1); /* eat only 'n' this time */
446 /* nchar had better be a keyword! */
447 keyword = ScanKeywordLookup("nchar");
448 yylval.keyword = keyword->name;
449 return keyword->value;
453 warn_on_first_escape = true;
454 saw_high_bit = false;
456 if (standard_conforming_strings)
463 warn_on_first_escape = false;
464 saw_high_bit = false;
473 /* check that the data remains valid if it might have been
474 * made invalid by unescaping any chars.
477 /* pg_verifymbstr(literalbuf, literallen, false);*/
478 yylval.str = litbufdup();
485 addlit(yytext, yyleng);
488 addlit(yytext, yyleng);
491 if (yytext[1] == '\'')
494 if (backslash_quote == BACKSLASH_QUOTE_OFF ||
495 (backslash_quote == BACKSLASH_QUOTE_SAFE_ENCODING &&
496 PG_ENCODING_IS_CLIENT_ONLY(pg_get_client_encoding())))
498 (errcode(ERRCODE_NONSTANDARD_USE_OF_ESCAPE_CHARACTER),
499 errmsg("unsafe use of \\' in a string literal"),
500 errhint("Use '' to write quotes in strings. \\' is insecure in client-only encodings."),
501 lexer_errposition()));
504 check_string_escape_warning(yytext[1]);
505 addlitchar(unescape_single_char(yytext[1]));
508 unsigned char c = strtoul(yytext+1, NULL, 8);
510 check_escape_warning();
512 if (IS_HIGHBIT_SET(c))
516 unsigned char c = strtoul(yytext+2, NULL, 16);
518 check_escape_warning();
520 if (IS_HIGHBIT_SET(c))
523 <xq,xe>{quotecontinue} {
527 /* This is only needed for \ just before EOF */
528 addlitchar(yytext[0]);
530 <xq,xe><<EOF>> { yyerror("unterminated quoted string"); }
534 dolqstart = pstrdup(yytext);
539 /* throw back all but the initial "$" */
541 /* and treat it as {other} */
545 if (strcmp(yytext, dolqstart) == 0)
549 yylval.str = litbufdup();
555 * When we fail to match $...$ to dolqstart, transfer
556 * the $... part to the output, but put back the final
557 * $ for rescanning. Consider $delim$...$junk$delim$
559 addlit(yytext, yyleng-1);
563 <xdolq>{dolqinside} {
564 addlit(yytext, yyleng);
566 <xdolq>{dolqfailed} {
567 addlit(yytext, yyleng);
570 /* This is only needed for $ inside the quoted text */
571 addlitchar(yytext[0]);
573 <xdolq><<EOF>> { yyerror("unterminated dollar-quoted string"); }
585 yyerror("zero-length delimited identifier");
587 if (literallen >= NAMEDATALEN)
588 truncate_identifier(ident, literallen, true);
596 addlit(yytext, yyleng);
598 <xd><<EOF>> { yyerror("unterminated quoted identifier"); }
612 * Check for embedded slash-star or dash-dash; those
613 * are comment starts, so operator must stop there.
614 * Note that slash-star or dash-dash at the first
615 * character will match a prior rule, not this one.
618 char *slashstar = strstr(yytext, "/*");
619 char *dashdash = strstr(yytext, "--");
621 if (slashstar && dashdash)
623 /* if both appear, take the first one */
624 if (slashstar > dashdash)
625 slashstar = dashdash;
628 slashstar = dashdash;
630 nchars = slashstar - yytext;
633 * For SQL compatibility, '+' and '-' cannot be the
634 * last char of a multi-char operator unless the operator
635 * contains chars that are not in SQL operators.
636 * The idea is to lex '=-' as two operators, but not
637 * to forbid operator names like '?-' that could not be
638 * sequences of SQL operators.
641 (yytext[nchars-1] == '+' ||
642 yytext[nchars-1] == '-'))
646 for (ic = nchars-2; ic >= 0; ic--)
648 if (strchr("~!@#^&|`?%", yytext[ic]))
652 break; /* found a char that makes it OK */
653 nchars--; /* else remove the +/-, and check again */
660 /* Strip the unwanted chars from the token */
663 * If what we have left is only one char, and it's
664 * one of the characters matching "self", then
665 * return it as a character token the same way
666 * that the "self" rule would have.
669 strchr(",()[].;:+-*/%^<>=", yytext[0]))
674 * Complain if operator is too long. Unlike the case
675 * for identifiers, we make this an error not a notice-
676 * and-truncate, because the odds are we are looking at
677 * a syntactic mistake anyway.
679 if (nchars >= NAMEDATALEN)
680 yyerror("operator too long");
682 /* Convert "!=" operator to "<>" for compatibility */
683 if (strcmp(yytext, "!=") == 0)
684 yylval.str = pstrdup("<>");
686 yylval.str = pstrdup(yytext);
692 yylval.ival = atol(yytext + 1);
702 val = strtol(yytext, &endptr, 10);
703 if (*endptr != '\0' || errno == ERANGE
704 #ifdef HAVE_LONG_INT_64
705 /* if long > 32 bits, check for overflow of int4 */
706 || val != (long) ((int32) val)
710 /* integer too large, treat it as a float */
711 yylval.str = pstrdup(yytext);
719 yylval.str = pstrdup(yytext);
724 yylval.str = pstrdup(yytext);
729 * throw back the [Ee], and treat as {decimal}. Note
730 * that it is possible the input is actually {integer},
731 * but since this case will almost certainly lead to a
732 * syntax error anyway, we don't bother to distinguish.
736 yylval.str = pstrdup(yytext);
740 /* throw back the [Ee][+-], and proceed as above */
743 yylval.str = pstrdup(yytext);
749 const ScanKeyword *keyword;
754 /* Is it a keyword? */
755 keyword = ScanKeywordLookup(yytext);
758 yylval.keyword = keyword->name;
759 return keyword->value;
763 * No. Convert the identifier to lower case, and truncate
766 ident = downcase_truncate_identifier(yytext, yyleng, true);
785 * Report a lexical-analysis-time cursor position, if possible.
787 * This is expected to be used within an ereport() call. The return value
788 * is a dummy (always 0, in fact).
790 * Note that this can only be used for messages from the lexer itself,
791 * since it depends on scanbuf to still be valid.
795 lexer_errposition(void)
799 /* Convert byte offset to character number */
800 pos = pg_mbstrlen_with_len(scanbuf, yylloc) + 1;
801 /* And pass it to the ereport mechanism */
802 return errposition(pos);
808 * Report a lexer or grammar error.
810 * The message's cursor position identifies the most recently lexed token.
811 * This is OK for syntax error messages from the Bison parser, because Bison
812 * parsers report error as soon as the first unparsable token is reached.
813 * Beware of using yyerror for other purposes, as the cursor position might
817 yyerror(const char *message)
819 longjmp(jmpbuffer, 1);
824 * Called before any actual parsing is done
827 scanner_init(const char *str)
829 int slen = strlen(str);
832 * Might be left over after ereport()
834 if (YY_CURRENT_BUFFER)
835 yy_delete_buffer(YY_CURRENT_BUFFER);
838 * Make a scan buffer with special termination needed by flex.
840 scanbuf = palloc(slen + 2);
841 memcpy(scanbuf, str, slen);
842 scanbuf[slen] = scanbuf[slen + 1] = YY_END_OF_BUFFER_CHAR;
843 scanbufhandle = yy_scan_buffer(scanbuf, slen + 2);
845 /* initialize literal buffer to a reasonable but expansible size */
847 literalbuf = (char *) palloc(literalalloc);
855 * Called after parsing is done to clean up after scanner_init()
860 yy_delete_buffer(scanbufhandle);
866 addlit(char *ytext, int yleng)
868 /* enlarge buffer if needed */
869 if ((literallen+yleng) >= literalalloc)
873 } while ((literallen+yleng) >= literalalloc);
874 literalbuf = (char *) repalloc(literalbuf, literalalloc);
876 /* append new data, add trailing null */
877 memcpy(literalbuf+literallen, ytext, yleng);
879 literalbuf[literallen] = '\0';
884 addlitchar(unsigned char ychar)
886 /* enlarge buffer if needed */
887 if ((literallen+1) >= literalalloc)
890 literalbuf = (char *) repalloc(literalbuf, literalalloc);
892 /* append new data, add trailing null */
893 literalbuf[literallen] = ychar;
895 literalbuf[literallen] = '\0';
900 * One might be tempted to write pstrdup(literalbuf) instead of this,
901 * but for long literals this is much faster because the length is
909 new = palloc(literallen + 1);
910 memcpy(new, literalbuf, literallen+1);
916 unescape_single_char(unsigned char c)
918 /* Normally we wouldn't expect to see \n where n has its high bit set
919 * but we set the flag to check the string if we do get it, so
920 * that this doesn't become a way of getting around the coding validity
923 if (IS_HIGHBIT_SET(c))
944 check_string_escape_warning(unsigned char ychar)
949 if (warn_on_first_escape && escape_string_warning)
951 (errcode(ERRCODE_NONSTANDARD_USE_OF_ESCAPE_CHARACTER),
952 errmsg("nonstandard use of \\' in a string literal"),
953 errhint("Use '' to write quotes in strings, or use the escape string syntax (E'...')."),
954 lexer_errposition()));
955 warn_on_first_escape = false; /* warn only once per string */
957 else if (ychar == '\\')
959 if (warn_on_first_escape && escape_string_warning)
961 (errcode(ERRCODE_NONSTANDARD_USE_OF_ESCAPE_CHARACTER),
962 errmsg("nonstandard use of \\\\ in a string literal"),
963 errhint("Use the escape string syntax for backslashes, e.g., E'\\\\'."),
964 lexer_errposition()));
965 warn_on_first_escape = false; /* warn only once per string */
968 check_escape_warning();
973 check_escape_warning(void)
976 if (warn_on_first_escape && escape_string_warning)
978 (errcode(ERRCODE_NONSTANDARD_USE_OF_ESCAPE_CHARACTER),
979 errmsg("nonstandard use of escape in a string literal"),
980 errhint("Use the escape string syntax for escapes, e.g., E'\\r\\n'."),
981 lexer_errposition()));
982 warn_on_first_escape = false; /* warn only once per string */
987 * downcase_truncate_identifier() --- do appropriate downcasing and
988 * truncation of an unquoted identifier. Optionally warn of truncation.
990 * Returns a palloc'd string containing the adjusted identifier.
992 * Note: in some usages the passed string is not null-terminated.
994 * Note: the API of this function is designed to allow for downcasing
995 * transformations that increase the string length, but we don't yet
996 * support that. If you want to implement it, you'll need to fix
997 * SplitIdentifierString() in utils/adt/varlena.c.
1000 downcase_truncate_identifier(const char *ident, int len, int warn)
1005 result = palloc(len + 1);
1008 * SQL99 specifies Unicode-aware case normalization, which we don't yet
1009 * have the infrastructure for. Instead we use tolower() to provide a
1010 * locale-aware translation. However, there are some locales where this
1011 * is not right either (eg, Turkish may do strange things with 'i' and
1012 * 'I'). Our current compromise is to use tolower() for characters with
1013 * the high bit set, and use an ASCII-only downcasing for 7-bit
1016 for (i = 0; i < len; i++)
1018 unsigned char ch = (unsigned char) ident[i];
1020 if (ch >= 'A' && ch <= 'Z')
1022 else if (ch >= 0x80 && isupper(ch))
1024 result[i] = (char) ch;
1028 if (i >= NAMEDATALEN)
1029 truncate_identifier(result, i, warn);
1035 * truncate_identifier() --- truncate an identifier to NAMEDATALEN-1 bytes.
1037 * The given string is modified in-place, if necessary. A warning is
1038 * issued if requested.
1040 * We require the caller to pass in the string length since this saves a
1041 * strlen() call in some common usages.
1044 truncate_identifier(char *ident, int len, int warn)
1046 if (len >= NAMEDATALEN)
1048 len = strlen(ident); /*pg_mbcliplen(ident, len, NAMEDATALEN - 1);*/
1052 (errcode(ERRCODE_NAME_TOO_LONG),
1053 errmsg("identifier \"%s\" will be truncated to \"%.*s\"",
1054 ident, len, ident)));