/* regexprops.c -- document the properties of the regular expressions understood by gnulib. Copyright (C) 2005-2022 Free Software Foundation, Inc. This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see . */ /* The output of this program is included in the GNU findutils source distribution. The copying conditions for that file are generated by the copying() function below. */ /* Written by James Youngman, . */ /* config.h must be included first. */ #include /* system headers */ #include #include #include #include #include /* gnulib headers */ #include "progname.h" /* find headers */ #include "regextype.h" static void output (const char *s, int escape) { (void) escape; fputs (s, stdout); } static void newline (void) { output ("\n", 0); } static void content (const char *s) { output (s, 1); } static void literal (const char *s) { output (s, 0); } static void directive (const char *s) { output (s, 0); } static void comment (const char *s) { directive ("@c"); if (s[0]) { literal (" "); literal (s); } newline (); } static void enum_item (const char *s) { newline (); directive ("@item "); literal (s); newline (); } static void begin_subsection (const char *name, const char *next, const char *prev, const char *up) { (void) next; (void) prev; (void) up; newline (); directive ("@node "); content (name); content (" regular expression syntax"); newline (); directive ("@subsection "); output ("@samp{", 0); content (name); output ("}", 0); content (" regular expression syntax"); newline (); } static void begintable_markup (char const *markup) { newline (); directive ("@table "); literal (markup); newline (); } static void endtable (void) { newline (); directive ("@end table"); newline (); } static void beginenum (void) { newline (); directive ("@enumerate"); newline (); } static void endenum (void) { newline (); directive ("@end enumerate"); newline (); } static void newpara (void) { content ("\n\n"); } static void describe_regex_syntax (int options) { newpara (); content ("The character @samp{.} matches any single character"); if ( (options & RE_DOT_NEWLINE) == 0 ) { content (" except newline"); } if (options & RE_DOT_NOT_NULL) { if ( (options & RE_DOT_NEWLINE) == 0 ) content (" and"); else content (" except"); content (" the null character"); } content ("."); newpara (); if (!(options & RE_LIMITED_OPS)) { begintable_markup ("@samp"); if (options & RE_BK_PLUS_QM) { enum_item ("\\+"); content ("indicates that the regular expression should match one" " or more occurrences of the previous atom or regexp."); enum_item ("\\?"); content ("indicates that the regular expression should match zero" " or one occurrence of the previous atom or regexp."); enum_item ("+ and ?"); content ("match themselves.\n"); } else { enum_item ("+"); content ("indicates that the regular expression should match one" " or more occurrences of the previous atom or regexp."); enum_item ("?"); content ("indicates that the regular expression should match zero" " or one occurrence of the previous atom or regexp."); enum_item ("\\+"); literal ("matches a @samp{+}"); enum_item ("\\?"); literal ("matches a @samp{?}."); } endtable (); } newpara (); content ("Bracket expressions are used to match ranges of characters. "); literal ("Bracket expressions where the range is backward, for example @samp{[z-a]}, are "); if (options & RE_NO_EMPTY_RANGES) content ("invalid"); else content ("ignored"); content (". "); if (options & RE_BACKSLASH_ESCAPE_IN_LISTS) literal ("Within square brackets, @samp{\\} can be used to quote " "the following character. "); else literal ("Within square brackets, @samp{\\} is taken literally. "); if (options & RE_CHAR_CLASSES) content ("Character classes are supported; for example " "@samp{[[:digit:]]} will match a single decimal digit.\n"); else literal ("Character classes are not supported, so for example " "you would need to use @samp{[0-9]} " "instead of @samp{[[:digit:]]}.\n"); if (options & RE_HAT_LISTS_NOT_NEWLINE) { literal ("Non-matching lists @samp{[^@dots{}]} do not ever match newline.\n"); } newpara (); if (options & RE_NO_GNU_OPS) { content ("GNU extensions are not supported and so " "@samp{\\w}, @samp{\\W}, @samp{\\<}, @samp{\\>}, @samp{\\b}, @samp{\\B}, @samp{\\`}, and @samp{\\'} " "match " "@samp{w}, @samp{W}, @samp{<}, @samp{>}, @samp{b}, @samp{B}, @samp{`}, and @samp{'} respectively.\n"); } else { content ("GNU extensions are supported:"); beginenum (); enum_item ("@samp{\\w} matches a character within a word"); enum_item ("@samp{\\W} matches a character which is not within a word"); enum_item ("@samp{\\<} matches the beginning of a word"); enum_item ("@samp{\\>} matches the end of a word"); enum_item ("@samp{\\b} matches a word boundary"); enum_item ("@samp{\\B} matches characters which are not a word boundary"); enum_item ("@samp{\\`} matches the beginning of the whole input"); enum_item ("@samp{\\'} matches the end of the whole input"); endenum (); } newpara (); if (options & RE_NO_BK_PARENS) { literal ("Grouping is performed with parentheses @samp{()}. "); if (options & RE_UNMATCHED_RIGHT_PAREN_ORD) literal ("An unmatched @samp{)} matches just itself. "); } else { literal ("Grouping is performed with backslashes followed by parentheses @samp{\\(}, @samp{\\)}. "); } if (options & RE_NO_BK_REFS) { content ("A backslash followed by a digit matches that digit."); } else { literal ("A backslash followed by a digit acts as a back-reference and matches the same thing as the previous grouped expression indicated by that number. For example @samp{\\2} matches the second group expression. The order of group expressions is determined by the position of their opening parenthesis "); if (options & RE_NO_BK_PARENS) literal ("@samp{(}"); else literal ("@samp{\\(}"); content ("."); } newpara (); if (!(options & RE_LIMITED_OPS)) { if (options & RE_NO_BK_VBAR) literal ("The alternation operator is @samp{|}."); else literal ("The alternation operator is @samp{\\|}."); } newpara (); if (options & RE_CONTEXT_INDEP_ANCHORS) { literal ("The characters @samp{^} and @samp{$} always represent the beginning and end of a string respectively, except within square brackets. Within brackets, @samp{^} can be used to invert the membership of the character class being specified.\n"); } else { literal ("The character @samp{^} only represents the beginning of a string when it appears:"); beginenum (); enum_item ("At the beginning of a regular expression"); if (options & RE_NO_BK_PARENS) { enum_item ("After an open-group, signified by @samp{(}"); } else { enum_item ("After an open-group, signified by @samp{\\(}"); } newline (); if (!(options & RE_LIMITED_OPS)) { if (options & RE_NEWLINE_ALT) enum_item ("After a newline"); if (options & RE_NO_BK_VBAR ) enum_item ("After the alternation operator @samp{|}"); else enum_item ("After the alternation operator @samp{\\|}"); } endenum (); newpara (); literal ("The character @samp{$} only represents the end of a string when it appears:"); beginenum (); enum_item ("At the end of a regular expression"); if (options & RE_NO_BK_PARENS) { enum_item ("Before a close-group, signified by @samp{)}"); } else { enum_item ("Before a close-group, signified by @samp{\\)}"); } if (!(options & RE_LIMITED_OPS)) { if (options & RE_NEWLINE_ALT) enum_item ("Before a newline"); if (options & RE_NO_BK_VBAR) enum_item ("Before the alternation operator @samp{|}"); else enum_item ("Before the alternation operator @samp{\\|}"); } endenum (); } newpara (); if (!(options & RE_LIMITED_OPS) ) { if ((options & RE_CONTEXT_INDEP_OPS) && !(options & RE_CONTEXT_INVALID_OPS)) { literal ("The characters @samp{*}, @samp{+} and @samp{?} are special anywhere in a regular expression.\n"); } else { if (options & RE_BK_PLUS_QM) literal ("@samp{\\*}, @samp{\\+} and @samp{\\?} "); else literal ("@samp{*}, @samp{+} and @samp{?} "); if (options & RE_CONTEXT_INVALID_OPS) { content ("are special at any point in a regular expression except the following places, where they are not allowed:"); } else { content ("are special at any point in a regular expression except:"); } beginenum (); enum_item ("At the beginning of a regular expression"); if (options & RE_NO_BK_PARENS) { enum_item ("After an open-group, signified by @samp{(}"); } else { enum_item ("After an open-group, signified by @samp{\\(}"); } if (!(options & RE_LIMITED_OPS)) { if (options & RE_NEWLINE_ALT) enum_item ("After a newline"); if (options & RE_NO_BK_VBAR) enum_item ("After the alternation operator @samp{|}"); else enum_item ("After the alternation operator @samp{\\|}"); } endenum (); } } newpara (); if (options & RE_INTERVALS) { if (options & RE_NO_BK_BRACES) { literal ("Intervals are specified by @samp{@{} and @samp{@}}.\n"); if (options & RE_INVALID_INTERVAL_ORD) { literal ("Invalid intervals are treated as literals, for example @samp{a@{1} is treated as @samp{a\\@{1}"); } else { literal ("Invalid intervals such as @samp{a@{1z} are not accepted.\n"); } } else { literal ("Intervals are specified by @samp{\\@{} and @samp{\\@}}.\n"); if (options & RE_INVALID_INTERVAL_ORD) { literal ("Invalid intervals are treated as literals, for example @samp{a\\@{1} is treated as @samp{a@{1}"); } else { literal ("Invalid intervals such as @samp{a\\@{1z} are not accepted.\n"); } } } newpara (); if (options & RE_NO_POSIX_BACKTRACKING) { content ("Matching succeeds as soon as the whole pattern is matched, meaning that the result may not be the longest possible match."); } else { content ("The longest possible match is returned; this applies to the regular expression as a whole and (subject to this constraint) to subexpressions within groups."); } newpara (); } static void copying (void) { static const char *copy_para[]= { /* The copyright year number range is with "--" in Texinfo files. */ "Copyright (C) 1994--2022 Free Software Foundation, Inc." ,"" ,"Permission is granted to copy, distribute and/or modify this document" ,"under the terms of the GNU Free Documentation License, Version 1.3 or" ,"any later version published by the Free Software Foundation; with no" ,"Invariant Sections, no Front-Cover Texts, and no Back-Cover Texts." ,"A copy of the license is included in the ``GNU Free" ,"Documentation License'' file as part of this distribution." "" ,NULL }; const char **s = copy_para; while (*s) comment (*s++); } static int ignore (int ix, const unsigned int context) { return 0 == (get_regex_type_context (ix) & context); } static void menu (unsigned int context) { int i; const char *name; output ("@menu\n", 0); for (i=0; get_regex_type_flags (i), name=get_regex_type_name (i); ++i) { if (!ignore (i, context)) { output ("* ", 0); output (name, 0); content (" regular expression syntax"); output ("::", 0); newline (); } } output ("@end menu\n", 0); } static const char * get_next (unsigned int ix, unsigned int context) { const char *next; while (get_regex_type_name (ix)) { if (!ignore (ix, context)) { next = get_regex_type_name (ix); if (NULL == next) return ""; else return next; } ++ix; } return ""; } static void describe_all (const char *contextname, unsigned int context, const char *up) { const char *name, *next, *previous; int regopts; int i, parent; copying (); newline (); literal ("@c this regular expression description is for: "); literal (contextname); newline (); newline (); menu (context); previous = ""; for (i=0; regopts = get_regex_type_flags (i), name=get_regex_type_name (i); ++i) { if (ignore (i, context)) { fprintf (stderr, "Skipping regexp type %s for context %s\n", name, contextname); name = previous; continue; } next = get_next (i+1, context); if (NULL == next) next = ""; begin_subsection (name, next, previous, up); parent = get_regex_type_synonym (i, context); if (parent >= 0) { content ("This is a synonym for "); content (get_regex_type_name (parent)); content ("."); } else { describe_regex_syntax (regopts); } previous = name; } } int main (int argc, char *argv[]) { const char *up = ""; unsigned int context = CONTEXT_ALL; const char *contextname = "all"; if (argc) set_program_name (argv[0]); else set_program_name ("regexprops"); if (argc > 1) { up = argv[1]; } if (argc > 2) { contextname = argv[2]; if (0 == strcmp (contextname, "findutils")) context = CONTEXT_FINDUTILS; else if (0 == strcmp (contextname, "generic")) context = CONTEXT_GENERIC; else if (0 == strcmp (contextname, "all")) context = CONTEXT_ALL; else { fprintf (stderr, "Unexpected context %s", contextname); return 1; } } describe_all (contextname, context, up); return 0; }