/* exclude.c -- exclude file names Copyright (C) 1992-1994, 1997, 1999-2007, 2009-2024 Free Software Foundation, Inc. This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see . */ /* Written by Paul Eggert and Sergey Poznyakoff . Thanks to Phil Proudman for improvement suggestions. */ #include #include #include #include #include #include #include #include #include #include "exclude.h" #include "filename.h" #include #include "hash.h" #if GNULIB_MCEL_PREFER # include "mcel.h" #else # include "mbuiter.h" #endif #include "xalloc.h" #if GNULIB_EXCLUDE_SINGLE_THREAD # include "unlocked-io.h" #endif /* Non-GNU systems lack these options, so we don't need to check them. */ #ifndef FNM_CASEFOLD # define FNM_CASEFOLD 0 #endif #ifndef FNM_EXTMATCH # define FNM_EXTMATCH 0 #endif #ifndef FNM_LEADING_DIR # define FNM_LEADING_DIR 0 #endif static_assert (((EXCLUDE_ANCHORED | EXCLUDE_INCLUDE | EXCLUDE_WILDCARDS) & (FNM_PATHNAME | FNM_NOESCAPE | FNM_PERIOD | FNM_LEADING_DIR | FNM_CASEFOLD | FNM_EXTMATCH)) == 0); /* Exclusion patterns are grouped into a singly-linked list of "exclusion segments". Each segment represents a set of patterns that can be matches using the same algorithm. Non-wildcard patterns are kept in hash tables, to speed up searches. Wildcard patterns are stored as arrays of patterns. */ /* An exclude pattern-options pair. The options are fnmatch options ORed with EXCLUDE_* options. */ struct patopts { int options; union { char const *pattern; regex_t re; } v; }; /* An array of pattern-options pairs. */ struct exclude_pattern { struct patopts *exclude; idx_t exclude_alloc; idx_t exclude_count; }; enum exclude_type { exclude_hash, /* a hash table of excluded names */ exclude_pattern /* an array of exclude patterns */ }; struct exclude_segment { struct exclude_segment *next; /* next segment in list */ enum exclude_type type; /* type of this segment */ int options; /* common options for this segment */ union { Hash_table *table; /* for type == exclude_hash */ struct exclude_pattern pat; /* for type == exclude_pattern */ } v; }; struct pattern_buffer { struct pattern_buffer *next; char *base; }; /* The exclude structure keeps a singly-linked list of exclude segments, maintained in reverse order. */ struct exclude { struct exclude_segment *head; struct pattern_buffer *patbuf; }; /* Register BUF in the pattern buffer list of EX. ADD_FUNC (see add_exclude_file and add_exclude_fp below) can use this function if it modifies the pattern, to ensure the allocated memory will be properly reclaimed upon calling free_exclude. */ void exclude_add_pattern_buffer (struct exclude *ex, char *buf) { struct pattern_buffer *pbuf = xmalloc (sizeof *pbuf); pbuf->base = buf; pbuf->next = ex->patbuf; ex->patbuf = pbuf; } /* Return true if STR has or may have wildcards, when matched with OPTIONS. Return false if STR definitely does not have wildcards. */ bool fnmatch_pattern_has_wildcards (const char *str, int options) { while (true) { switch (*str++) { case '.': case '{': case '}': case '(': case ')': if (options & EXCLUDE_REGEX) return true; break; case '\\': if (options & EXCLUDE_REGEX) continue; else str += ! (options & FNM_NOESCAPE) && *str; break; case '+': case '@': case '!': if (options & FNM_EXTMATCH && *str == '(') return true; break; case '?': case '*': case '[': return true; case '\0': return false; } } } static void unescape_pattern (char *str) { char const *q = str; do q += *q == '\\' && q[1]; while ((*str++ = *q++)); } /* Return a newly allocated and empty exclude list. */ struct exclude * new_exclude (void) { return xzalloc (sizeof *new_exclude ()); } /* Calculate the hash of string. */ static size_t string_hasher (void const *data, size_t n_buckets) { return hash_string (data, n_buckets); } /* Ditto, for case-insensitive hashes */ static size_t string_hasher_ci (void const *data, size_t n_buckets) { char const *p = data; size_t value = 0; #if GNULIB_MCEL_PREFER while (*p) { mcel_t g = mcel_scanz (p); value = value * 31 + (c32tolower (g.ch) - g.err); p += g.len; } #else mbui_iterator_t iter; for (mbui_init (iter, p); mbui_avail (iter); mbui_advance (iter)) { mbchar_t m = mbui_cur (iter); char32_t wc; if (m.wc_valid) wc = c32tolower (m.wc); else wc = *m.ptr; value = value * 31 + wc; } #endif return value % n_buckets; } /* compare two strings for equality */ static bool string_compare (void const *data1, void const *data2) { return strcmp (data1, data2) == 0; } /* compare two strings for equality, case-insensitive */ static bool string_compare_ci (void const *data1, void const *data2) { return mbscasecmp (data1, data2) == 0; } /* Create new exclude segment of given TYPE and OPTIONS, and attach it to the head of EX. */ static void new_exclude_segment (struct exclude *ex, enum exclude_type type, int options) { struct exclude_segment *sp = xmalloc (sizeof (struct exclude_segment)); sp->type = type; sp->options = options; switch (type) { case exclude_pattern: sp->v.pat = (struct exclude_pattern) {0}; break; case exclude_hash: sp->v.table = hash_initialize (0, nullptr, (options & FNM_CASEFOLD ? string_hasher_ci : string_hasher), (options & FNM_CASEFOLD ? string_compare_ci : string_compare), free); break; } sp->next = ex->head; ex->head = sp; } /* Free a single exclude segment */ static void free_exclude_segment (struct exclude_segment *seg) { switch (seg->type) { case exclude_pattern: for (idx_t i = 0; i < seg->v.pat.exclude_count; i++) if (seg->v.pat.exclude[i].options & EXCLUDE_REGEX) regfree (&seg->v.pat.exclude[i].v.re); free (seg->v.pat.exclude); break; case exclude_hash: hash_free (seg->v.table); break; } free (seg); } /* Free the storage associated with an exclude list. */ void free_exclude (struct exclude *ex) { for (struct exclude_segment *seg = ex->head; seg; ) { struct exclude_segment *next = seg->next; free_exclude_segment (seg); seg = next; } for (struct pattern_buffer *pbuf = ex->patbuf; pbuf; ) { struct pattern_buffer *next = pbuf->next; free (pbuf->base); free (pbuf); pbuf = next; } free (ex); } /* Return zero if PATTERN matches F, obeying OPTIONS, except that (unlike fnmatch) wildcards are disabled in PATTERN. */ static int fnmatch_no_wildcards (char const *pattern, char const *f, int options) { if (! (options & FNM_LEADING_DIR)) return ((options & FNM_CASEFOLD) ? mbscasecmp (pattern, f) : strcmp (pattern, f)); else if (! (options & FNM_CASEFOLD)) { idx_t patlen = strlen (pattern); int r = strncmp (pattern, f, patlen); if (! r) { r = f[patlen]; if (r == '/') r = 0; } return r; } else { /* Walk through a copy of F, seeing whether P matches any prefix of F. FIXME: This is an O(N**2) algorithm; it should be O(N). Also, the copy should not be necessary. However, fixing this will probably involve a change to the mbs* API. */ char *fcopy = xstrdup (f); int r; for (char *p = fcopy; ; *p++ = '/') { p = strchr (p, '/'); if (p) *p = '\0'; r = mbscasecmp (pattern, fcopy); if (!p || r <= 0) break; } free (fcopy); return r; } } bool exclude_fnmatch (char const *pattern, char const *f, int options) { int (*matcher) (char const *, char const *, int) = (options & EXCLUDE_WILDCARDS ? fnmatch : fnmatch_no_wildcards); bool matched = matcher (pattern, f, options) == 0; if (! (options & EXCLUDE_ANCHORED)) for (char const *p = f; *p && ! matched; p++) if (*p == '/' && p[1] != '/') matched = matcher (pattern, p + 1, options) == 0; return matched; } static bool exclude_patopts (struct patopts const *opts, char const *f) { int options = opts->options; return (options & EXCLUDE_REGEX ? regexec (&opts->v.re, f, 0, nullptr, 0) == 0 : exclude_fnmatch (opts->v.pattern, f, options)); } /* Return true if the exclude_pattern segment SEG matches F. */ static bool file_pattern_matches (struct exclude_segment const *seg, char const *f) { idx_t exclude_count = seg->v.pat.exclude_count; struct patopts const *exclude = seg->v.pat.exclude; for (idx_t i = 0; i < exclude_count; i++) if (exclude_patopts (exclude + i, f)) return true; return false; } /* Return true if the exclude_hash segment SEG matches F. BUFFER is an auxiliary storage of the same length as F (with nul terminator included) */ static bool file_name_matches (struct exclude_segment const *seg, char const *f, char *buffer) { int options = seg->options; Hash_table *table = seg->v.table; do { /* initialize the pattern */ strcpy (buffer, f); while (true) { if (hash_lookup (table, buffer)) return true; if (options & FNM_LEADING_DIR) { char *p = strrchr (buffer, '/'); if (p) { *p = '\0'; continue; } } break; } if (!(options & EXCLUDE_ANCHORED)) { f = strchr (f, '/'); if (f) f++; } else break; } while (f); return false; } /* Return true if EX excludes F. */ bool excluded_file_name (struct exclude const *ex, char const *f) { /* If no patterns are given, the default is to include. */ if (!ex->head) return false; bool invert = false; char *filename = nullptr; /* Scan through the segments, reporting the status of the first match. The segments are in reverse order, so this reports the status of the last match in the original option list. */ struct exclude_segment *seg; for (seg = ex->head; ; seg = seg->next) { if (seg->type == exclude_hash) { if (!filename) filename = xmalloc (strlen (f) + 1); if (file_name_matches (seg, f, filename)) break; } else { if (file_pattern_matches (seg, f)) break; } if (! seg->next) { /* If patterns are given but none match, the default is the opposite of the last segment (i.e., the first in the original option list). For example, in the command 'grep -r --exclude="a*" --include="*b" pat dir', the first option is --exclude so any file name matching neither a* nor *b is included. */ invert = true; break; } } free (filename); return invert ^ ! (seg->options & EXCLUDE_INCLUDE); } /* Append to EX the exclusion PATTERN with OPTIONS. */ void add_exclude (struct exclude *ex, char const *pattern, int options) { if ((options & (EXCLUDE_REGEX|EXCLUDE_WILDCARDS)) && fnmatch_pattern_has_wildcards (pattern, options)) { if (! (ex->head && ex->head->type == exclude_pattern && ((ex->head->options & EXCLUDE_INCLUDE) == (options & EXCLUDE_INCLUDE)))) new_exclude_segment (ex, exclude_pattern, options); struct exclude_pattern *pat = &ex->head->v.pat; if (pat->exclude_count == pat->exclude_alloc) pat->exclude = xpalloc (pat->exclude, &pat->exclude_alloc, 1, -1, sizeof *pat->exclude); struct patopts *patopts = &pat->exclude[pat->exclude_count++]; patopts->options = options; if (options & EXCLUDE_REGEX) { int rc; int cflags = (REG_NOSUB | REG_EXTENDED | (options & FNM_CASEFOLD ? REG_ICASE : 0)); if (! (options & FNM_LEADING_DIR)) rc = regcomp (&patopts->v.re, pattern, cflags); else for (idx_t len = strlen (pattern); ; len--) { if (len == 0) { rc = 1; break; } if (!ISSLASH (pattern[len - 1])) { static char const patsuffix[] = "(/.*)?"; char *tmp = ximalloc (len + sizeof patsuffix); memcpy (tmp, pattern, len); strcpy (tmp + len, patsuffix); rc = regcomp (&patopts->v.re, tmp, cflags); free (tmp); break; } } if (rc) { pat->exclude_count--; return; } } else { if (options & EXCLUDE_ALLOC) { char *dup = xstrdup (pattern); pattern = dup; exclude_add_pattern_buffer (ex, dup); } patopts->v.pattern = pattern; } } else { int exclude_hash_flags = (EXCLUDE_INCLUDE | EXCLUDE_ANCHORED | FNM_LEADING_DIR | FNM_CASEFOLD); if (! (ex->head && ex->head->type == exclude_hash && ((ex->head->options & exclude_hash_flags) == (options & exclude_hash_flags)))) new_exclude_segment (ex, exclude_hash, options); char *str = xstrdup (pattern); if ((options & (EXCLUDE_WILDCARDS | FNM_NOESCAPE)) == EXCLUDE_WILDCARDS) unescape_pattern (str); if (hash_insert (ex->head->v.table, str) != str) free (str); } } /* Use ADD_FUNC to append to EX the patterns in FILE_NAME, each with OPTIONS. LINE_END terminates each pattern in the file. If LINE_END is a space character, ignore trailing spaces and empty lines in FP. Return -1 (setting errno) on failure, 0 on success. */ int add_exclude_fp (void (*add_func) (struct exclude *, char const *, int, void *), struct exclude *ex, FILE *fp, int options, char line_end, void *data) { char *buf = nullptr; idx_t buf_alloc = 0; idx_t buf_count = 0; for (int c; (c = getc (fp)) != EOF; ) { if (buf_count == buf_alloc) buf = xpalloc (buf, &buf_alloc, 1, -1, 1); buf[buf_count++] = c; } int e = ferror (fp) ? errno : 0; buf = xirealloc (buf, buf_count + 1); buf[buf_count] = line_end; char const *lim = (buf + buf_count + ! (buf_count == 0 || buf[buf_count - 1] == line_end)); exclude_add_pattern_buffer (ex, buf); char *pattern = buf; for (char *p = buf; p < lim; p++) if (*p == line_end) { char *pattern_end = p; if (isspace ((unsigned char) line_end)) { /* Assume that no multi-byte character has a trailing byte that satisfies isspace, and that nobody cares about trailing white space containing non-single-byte characters. If either assumption turns out to be false, presumably the code should be changed to scan forward through the entire pattern, one multi-byte character at a time. */ for (; ; pattern_end--) if (pattern_end == pattern) goto next_pattern; else if (! isspace ((unsigned char) pattern_end[-1])) break; } *pattern_end = '\0'; add_func (ex, pattern, options, data); next_pattern: pattern = p + 1; } errno = e; return e ? -1 : 0; } static void call_addfn (struct exclude *ex, char const *pattern, int options, void *data) { void (**addfnptr) (struct exclude *, char const *, int) = data; (*addfnptr) (ex, pattern, options); } int add_exclude_file (void (*add_func) (struct exclude *, char const *, int), struct exclude *ex, char const *file_name, int options, char line_end) { if (strcmp (file_name, "-") == 0) return add_exclude_fp (call_addfn, ex, stdin, options, line_end, &add_func); FILE *in = fopen (file_name, "re"); if (!in) return -1; int rc = add_exclude_fp (call_addfn, ex, in, options, line_end, &add_func); int e = errno; if (fclose (in) < 0) return -1; errno = e; return rc; }