PostgreSQL Source Code git master
regc_pg_locale.c
Go to the documentation of this file.
1/*-------------------------------------------------------------------------
2 *
3 * regc_pg_locale.c
4 * ctype functions adapted to work on pg_wchar (a/k/a chr),
5 * and functions to cache the results of wholesale ctype probing.
6 *
7 * This file is #included by regcomp.c; it's not meant to compile standalone.
8 *
9 * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
10 * Portions Copyright (c) 1994, Regents of the University of California
11 *
12 * IDENTIFICATION
13 * src/backend/regex/regc_pg_locale.c
14 *
15 *-------------------------------------------------------------------------
16 */
17
19#include "common/unicode_case.h"
21#include "utils/pg_locale.h"
22#include "utils/pg_locale_c.h"
23
25
26
27/*
28 * pg_set_regex_collation: set collation for these functions to obey
29 *
30 * This is called when beginning compilation or execution of a regexp.
31 * Since there's no need for reentrancy of regexp operations, it's okay
32 * to store the results in static variables.
33 */
34void
36{
38
39 if (!OidIsValid(collation))
40 {
41 /*
42 * This typically means that the parser could not resolve a conflict
43 * of implicit collations, so report it that way.
44 */
46 (errcode(ERRCODE_INDETERMINATE_COLLATION),
47 errmsg("could not determine which collation to use for regular expression"),
48 errhint("Use the COLLATE clause to set the collation explicitly.")));
49 }
50
52
53 if (!locale->deterministic)
55 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
56 errmsg("nondeterministic collations are not supported for regular expressions")));
57
59}
60
61/*
62 * The following functions overlap with those defined in pg_locale.c. XXX:
63 * consider refactor.
64 */
65
66static int
68{
70 return (c <= (pg_wchar) 127 &&
72 else
74}
75
76static int
78{
80 return (c <= (pg_wchar) 127 &&
82 else
84}
85
86static int
88{
90 return (c <= (pg_wchar) 127 &&
92 else
94}
95
96static int
98{
99 /* We define word characters as alnum class plus underscore */
100 if (c == CHR('_'))
101 return 1;
102 return regc_wc_isalnum(c);
103}
104
105static int
107{
109 return (c <= (pg_wchar) 127 &&
111 else
113}
114
115static int
117{
119 return (c <= (pg_wchar) 127 &&
121 else
123}
124
125static int
127{
129 return (c <= (pg_wchar) 127 &&
131 else
133}
134
135static int
137{
139 return (c <= (pg_wchar) 127 &&
141 else
143}
144
145static int
147{
149 return (c <= (pg_wchar) 127 &&
151 else
153}
154
155static int
157{
159 return (c <= (pg_wchar) 127 &&
161 else
163}
164
165static pg_wchar
167{
169 {
170 if (c <= (pg_wchar) 127)
171 return pg_ascii_toupper((unsigned char) c);
172 return c;
173 }
174 else
176}
177
178static pg_wchar
180{
182 {
183 if (c <= (pg_wchar) 127)
184 return pg_ascii_tolower((unsigned char) c);
185 return c;
186 }
187 else
189}
190
191
192/*
193 * These functions cache the results of probing libc's ctype behavior for
194 * all character codes of interest in a given encoding/collation. The
195 * result is provided as a "struct cvec", but notice that the representation
196 * is a touch different from a cvec created by regc_cvec.c: we allocate the
197 * chrs[] and ranges[] arrays separately from the struct so that we can
198 * realloc them larger at need. This is okay since the cvecs made here
199 * should never be freed by freecvec().
200 *
201 * We use malloc not palloc since we mustn't lose control on out-of-memory;
202 * the main regex code expects us to return a failure indication instead.
203 */
204
205typedef int (*regc_wc_probefunc) (pg_wchar c);
206
207typedef struct pg_ctype_cache
208{
209 regc_wc_probefunc probefunc; /* regc_wc_isalpha or a sibling */
210 pg_locale_t locale; /* locale this entry is for */
211 struct cvec cv; /* cache entry contents */
212 struct pg_ctype_cache *next; /* chain link */
214
216
217/*
218 * Add a chr or range to pcc->cv; return false if run out of memory
219 */
220static bool
221store_match(pg_ctype_cache *pcc, pg_wchar chr1, int nchrs)
222{
223 chr *newchrs;
224
225 if (nchrs > 1)
226 {
227 if (pcc->cv.nranges >= pcc->cv.rangespace)
228 {
229 pcc->cv.rangespace *= 2;
230 newchrs = (chr *) realloc(pcc->cv.ranges,
231 pcc->cv.rangespace * sizeof(chr) * 2);
232 if (newchrs == NULL)
233 return false;
234 pcc->cv.ranges = newchrs;
235 }
236 pcc->cv.ranges[pcc->cv.nranges * 2] = chr1;
237 pcc->cv.ranges[pcc->cv.nranges * 2 + 1] = chr1 + nchrs - 1;
238 pcc->cv.nranges++;
239 }
240 else
241 {
242 assert(nchrs == 1);
243 if (pcc->cv.nchrs >= pcc->cv.chrspace)
244 {
245 pcc->cv.chrspace *= 2;
246 newchrs = (chr *) realloc(pcc->cv.chrs,
247 pcc->cv.chrspace * sizeof(chr));
248 if (newchrs == NULL)
249 return false;
250 pcc->cv.chrs = newchrs;
251 }
252 pcc->cv.chrs[pcc->cv.nchrs++] = chr1;
253 }
254 return true;
255}
256
257/*
258 * Given a probe function (e.g., regc_wc_isalpha) get a struct cvec for all
259 * chrs satisfying the probe function. The active collation is the one
260 * previously set by pg_set_regex_collation. Return NULL if out of memory.
261 *
262 * Note that the result must not be freed or modified by caller.
263 */
264static struct cvec *
266{
267 pg_ctype_cache *pcc;
268 pg_wchar max_chr;
269 pg_wchar cur_chr;
270 int nmatches;
271 chr *newchrs;
272
273 /*
274 * Do we already have the answer cached?
275 */
 /* A cache entry is identified by the (probe function, locale) pair. */
276 for (pcc = pg_ctype_cache_list; pcc != NULL; pcc = pcc->next)
277 {
278 if (pcc->probefunc == probefunc &&
279 pcc->locale == pg_regex_locale)
280 return &pcc->cv;
281 }
282
283 /*
284 * Nope, so initialize some workspace ...
285 */
286 pcc = (pg_ctype_cache *) malloc(sizeof(pg_ctype_cache));
287 if (pcc == NULL)
288 return NULL;
289 pcc->probefunc = probefunc;
290 pcc->locale = pg_regex_locale;
291 pcc->cv.nchrs = 0;
292 pcc->cv.chrspace = 128;
293 pcc->cv.chrs = (chr *) malloc(pcc->cv.chrspace * sizeof(chr));
294 pcc->cv.nranges = 0;
295 pcc->cv.rangespace = 64;
 /* ranges[] holds two chrs (first, last) per range, hence the "* 2". */
296 pcc->cv.ranges = (chr *) malloc(pcc->cv.rangespace * sizeof(chr) * 2);
 /*
  * Both allocations are checked together.  If only one of them failed,
  * the cleanup code at out_of_memory still frees both pointers, which
  * relies on free() accepting a NULL argument.
  */
297 if (pcc->cv.chrs == NULL || pcc->cv.ranges == NULL)
298 goto out_of_memory;
299 pcc->cv.cclasscode = cclasscode;
300
301 /*
302 * Decide how many character codes we ought to look through. In general
303 * we don't go past MAX_SIMPLE_CHR; chr codes above that are handled at
304 * runtime using the "high colormap" mechanism. However, in C locale
305 * there's no need to go further than 127, and if we only have a 1-byte
306 * <ctype.h> API there's no need to go further than that can handle.
307 *
308 * If it's not MAX_SIMPLE_CHR that's constraining the search, mark the
309 * output cvec as not having any locale-dependent behavior, since there
310 * will be no need to do any run-time locale checks. (The #if's here
311 * would always be true for production values of MAX_SIMPLE_CHR, but it's
312 * useful to allow it to be small for testing purposes.)
313 */
315 {
316#if MAX_SIMPLE_CHR >= 127
317 max_chr = (pg_wchar) 127;
 /* -1 overrides the caller-supplied cclasscode stored above. */
318 pcc->cv.cclasscode = -1;
319#else
320 max_chr = (pg_wchar) MAX_SIMPLE_CHR;
321#endif
322 }
323 else
324 {
 /* A max_chr of zero in the ctype methods is not used as a limit. */
325 if (pg_regex_locale->ctype->max_chr != 0 &&
327 {
328 max_chr = pg_regex_locale->ctype->max_chr;
329 pcc->cv.cclasscode = -1;
330 }
331 else
332 max_chr = (pg_wchar) MAX_SIMPLE_CHR;
333 }
334
335 /*
336 * And scan 'em ...
337 */
338 nmatches = 0; /* number of consecutive matches */
339
340 for (cur_chr = 0; cur_chr <= max_chr; cur_chr++)
341 {
342 if ((*probefunc) (cur_chr))
343 nmatches++;
344 else if (nmatches > 0)
345 {
 /*
  * cur_chr is the first non-matching chr after a run, so the run
  * began at cur_chr - nmatches.  store_match() files a run of one
  * under chrs[] and a longer run under ranges[].
  */
346 if (!store_match(pcc, cur_chr - nmatches, nmatches))
347 goto out_of_memory;
348 nmatches = 0;
349 }
350 }
351
 /*
  * Flush a run that extended through max_chr.  The loop exits with
  * cur_chr one past the last chr tested, so the same start-of-run
  * arithmetic applies.
  */
352 if (nmatches > 0)
353 if (!store_match(pcc, cur_chr - nmatches, nmatches))
354 goto out_of_memory;
355
356 /*
357 * We might have allocated more memory than needed, if so free it
358 */
 /*
  * An empty array is released outright rather than shrunk.  A failure of
  * a shrinking realloc() is handled like any other out-of-memory case.
  */
359 if (pcc->cv.nchrs == 0)
360 {
361 free(pcc->cv.chrs);
362 pcc->cv.chrs = NULL;
363 pcc->cv.chrspace = 0;
364 }
365 else if (pcc->cv.nchrs < pcc->cv.chrspace)
366 {
367 newchrs = (chr *) realloc(pcc->cv.chrs,
368 pcc->cv.nchrs * sizeof(chr));
369 if (newchrs == NULL)
370 goto out_of_memory;
371 pcc->cv.chrs = newchrs;
372 pcc->cv.chrspace = pcc->cv.nchrs;
373 }
374 if (pcc->cv.nranges == 0)
375 {
376 free(pcc->cv.ranges);
377 pcc->cv.ranges = NULL;
378 pcc->cv.rangespace = 0;
379 }
380 else if (pcc->cv.nranges < pcc->cv.rangespace)
381 {
382 newchrs = (chr *) realloc(pcc->cv.ranges,
383 pcc->cv.nranges * sizeof(chr) * 2);
384 if (newchrs == NULL)
385 goto out_of_memory;
386 pcc->cv.ranges = newchrs;
387 pcc->cv.rangespace = pcc->cv.nranges;
388 }
389
390 /*
391 * Success, link it into cache chain
392 */
395
396 return &pcc->cv;
397
398 /*
399 * Failure, clean up
400 */
 /*
  * Every jump here happens before the entry is returned to the caller, so
  * the arrays and the entry itself are simply freed.
  */
401out_of_memory:
402 free(pcc->cv.chrs);
403 free(pcc->cv.ranges);
404 free(pcc);
405
406 return NULL;
407}
#define OidIsValid(objectId)
Definition: c.h:779
int errhint(const char *fmt,...)
Definition: elog.c:1330
int errcode(int sqlerrcode)
Definition: elog.c:863
int errmsg(const char *fmt,...)
Definition: elog.c:1080
#define ERROR
Definition: elog.h:39
#define ereport(elevel,...)
Definition: elog.h:150
#define realloc(a, b)
Definition: header.h:60
#define free(a)
Definition: header.h:65
#define malloc(a)
Definition: header.h:50
static char * locale
Definition: initdb.c:140
unsigned int pg_wchar
Definition: mbprint.c:31
pg_locale_t pg_newlocale_from_collation(Oid collid)
Definition: pg_locale.c:1186
#define PG_ISLOWER
Definition: pg_locale_c.h:23
#define PG_ISPRINT
Definition: pg_locale_c.h:25
#define PG_ISALPHA
Definition: pg_locale_c.h:20
#define PG_ISGRAPH
Definition: pg_locale_c.h:24
#define PG_ISPUNCT
Definition: pg_locale_c.h:26
#define PG_ISDIGIT
Definition: pg_locale_c.h:19
#define PG_ISUPPER
Definition: pg_locale_c.h:22
#define PG_ISALNUM
Definition: pg_locale_c.h:21
#define PG_ISSPACE
Definition: pg_locale_c.h:27
static const unsigned char pg_char_properties[128]
Definition: pg_locale_c.h:29
unsigned char pg_ascii_tolower(unsigned char ch)
Definition: pgstrcasecmp.c:146
unsigned char pg_ascii_toupper(unsigned char ch)
Definition: pgstrcasecmp.c:135
unsigned int Oid
Definition: postgres_ext.h:32
char * c
int(* regc_wc_probefunc)(pg_wchar c)
static int regc_wc_isdigit(pg_wchar c)
static struct cvec * regc_ctype_get_cache(regc_wc_probefunc probefunc, int cclasscode)
static int regc_wc_ispunct(pg_wchar c)
static pg_ctype_cache * pg_ctype_cache_list
static int regc_wc_isalnum(pg_wchar c)
static bool store_match(pg_ctype_cache *pcc, pg_wchar chr1, int nchrs)
static int regc_wc_islower(pg_wchar c)
static pg_wchar regc_wc_toupper(pg_wchar c)
static int regc_wc_isupper(pg_wchar c)
static int regc_wc_isalpha(pg_wchar c)
static int regc_wc_isgraph(pg_wchar c)
static int regc_wc_isprint(pg_wchar c)
void pg_set_regex_collation(Oid collation)
static int regc_wc_isword(pg_wchar c)
static pg_locale_t pg_regex_locale
static pg_wchar regc_wc_tolower(pg_wchar c)
static int regc_wc_isspace(pg_wchar c)
struct pg_ctype_cache pg_ctype_cache
#define MAX_SIMPLE_CHR
Definition: regcustom.h:87
pg_wchar chr
Definition: regcustom.h:59
#define CHR(c)
Definition: regcustom.h:62
#define assert(x)
Definition: regcustom.h:56
pg_wchar(* wc_toupper)(pg_wchar wc, pg_locale_t locale)
Definition: pg_locale.h:111
pg_wchar max_chr
Definition: pg_locale.h:128
bool(* wc_ispunct)(pg_wchar wc, pg_locale_t locale)
Definition: pg_locale.h:108
bool(* wc_isprint)(pg_wchar wc, pg_locale_t locale)
Definition: pg_locale.h:107
bool(* wc_isalpha)(pg_wchar wc, pg_locale_t locale)
Definition: pg_locale.h:102
pg_wchar(* wc_tolower)(pg_wchar wc, pg_locale_t locale)
Definition: pg_locale.h:112
bool(* wc_isupper)(pg_wchar wc, pg_locale_t locale)
Definition: pg_locale.h:104
bool(* wc_isspace)(pg_wchar wc, pg_locale_t locale)
Definition: pg_locale.h:109
bool(* wc_isgraph)(pg_wchar wc, pg_locale_t locale)
Definition: pg_locale.h:106
bool(* wc_islower)(pg_wchar wc, pg_locale_t locale)
Definition: pg_locale.h:105
bool(* wc_isalnum)(pg_wchar wc, pg_locale_t locale)
Definition: pg_locale.h:103
bool(* wc_isdigit)(pg_wchar wc, pg_locale_t locale)
Definition: pg_locale.h:101
Definition: regguts.h:279
int chrspace
Definition: regguts.h:281
int nchrs
Definition: regguts.h:280
int rangespace
Definition: regguts.h:284
chr * chrs
Definition: regguts.h:282
chr * ranges
Definition: regguts.h:285
int cclasscode
Definition: regguts.h:286
int nranges
Definition: regguts.h:283
regc_wc_probefunc probefunc
struct pg_ctype_cache * next
pg_locale_t locale
struct cvec cv
const struct ctype_methods * ctype
Definition: pg_locale.h:153