PostgreSQL Source Code git master
pg_locale_libc.c
Go to the documentation of this file.
1/*-----------------------------------------------------------------------
2 *
3 * PostgreSQL locale utilities for libc
4 *
5 * Portions Copyright (c) 2002-2025, PostgreSQL Global Development Group
6 *
7 * src/backend/utils/adt/pg_locale_libc.c
8 *
9 *-----------------------------------------------------------------------
10 */
11
12#include "postgres.h"
13
14#include <limits.h>
15#include <wctype.h>
16
17#include "access/htup_details.h"
18#include "catalog/pg_database.h"
20#include "mb/pg_wchar.h"
21#include "miscadmin.h"
22#include "utils/builtins.h"
23#include "utils/formatting.h"
24#include "utils/memutils.h"
25#include "utils/pg_locale.h"
26#include "utils/syscache.h"
27
28#ifdef __GLIBC__
29#include <gnu/libc-version.h>
30#endif
31
32#ifdef WIN32
33#include <shlwapi.h>
34#endif
35
36/*
37 * For the libc provider, to provide as much functionality as possible on a
38 * variety of platforms without going so far as to implement everything from
39 * scratch, we use several implementation strategies depending on the
40 * situation:
41 *
42 * 1. In C/POSIX collations, we use hard-wired code. We can't depend on
43 * the <ctype.h> functions since those will obey LC_CTYPE. Note that these
44 * collations don't give a fig about multibyte characters.
45 *
46 * 2. When working in UTF8 encoding, we use the <wctype.h> functions.
47 * This assumes that every platform uses Unicode codepoints directly
48 * as the wchar_t representation of Unicode. On some platforms
49 * wchar_t is only 16 bits wide, so we have to punt for codepoints > 0xFFFF.
50 *
51 * 3. In all other encodings, we use the <ctype.h> functions for pg_wchar
52 * values up to 255, and punt for values above that. This is 100% correct
53 * only in single-byte encodings such as LATINn. However, non-Unicode
54 * multibyte encodings are mostly Far Eastern character sets for which the
55 * properties being tested here aren't very relevant for higher code values
56 * anyway. The difficulty with using the <wctype.h> functions with
57 * non-Unicode multibyte encodings is that we can have no certainty that
58 * the platform's wchar_t representation matches what we do in pg_wchar
59 * conversions.
60 *
61 * As a special case, in the "default" collation, (2) and (3) force ASCII
62 * letters to follow ASCII upcase/downcase rules, while in a non-default
63 * collation we just let the library functions do what they will. The case
64 * where this matters is treatment of I/i in Turkish, and the behavior is
65 * meant to match the upper()/lower() SQL functions.
66 *
67 * We store the active collation setting in static variables. In principle
68 * it could be passed down to here via the regex library's "struct vars" data
69 * structure; but that would require somewhat invasive changes in the regex
70 * library, and right now there's no real benefit to be gained from that.
71 *
72 * NB: the coding here assumes pg_wchar is an unsigned type.
73 */
74
75/*
76 * Size of stack buffer to use for string transformations, used to avoid heap
77 * allocations in typical cases. This should be large enough that most strings
78 * will fit, but small enough that we feel comfortable putting it on the
79 * stack.
80 */
81#define TEXTBUFLEN 1024
82
84
85static int strncoll_libc(const char *arg1, ssize_t len1,
86 const char *arg2, ssize_t len2,
88static size_t strnxfrm_libc(char *dest, size_t destsize,
89 const char *src, ssize_t srclen,
91extern char *get_collation_actual_version_libc(const char *collcollate);
92static locale_t make_libc_collator(const char *collate,
93 const char *ctype);
94
95#ifdef WIN32
96static int strncoll_libc_win32_utf8(const char *arg1, ssize_t len1,
97 const char *arg2, ssize_t len2,
99#endif
100
101static size_t char2wchar(wchar_t *to, size_t tolen, const char *from,
102 size_t fromlen, locale_t loc);
103
104static size_t strlower_libc_sb(char *dest, size_t destsize,
105 const char *src, ssize_t srclen,
107static size_t strlower_libc_mb(char *dest, size_t destsize,
108 const char *src, ssize_t srclen,
110static size_t strtitle_libc_sb(char *dest, size_t destsize,
111 const char *src, ssize_t srclen,
113static size_t strtitle_libc_mb(char *dest, size_t destsize,
114 const char *src, ssize_t srclen,
116static size_t strupper_libc_sb(char *dest, size_t destsize,
117 const char *src, ssize_t srclen,
119static size_t strupper_libc_mb(char *dest, size_t destsize,
120 const char *src, ssize_t srclen,
122
123static bool
125{
126 return isdigit_l((unsigned char) wc, locale->lt);
127}
128
129static bool
131{
132 return isalpha_l((unsigned char) wc, locale->lt);
133}
134
135static bool
137{
138 return isalnum_l((unsigned char) wc, locale->lt);
139}
140
141static bool
143{
144 return isupper_l((unsigned char) wc, locale->lt);
145}
146
147static bool
149{
150 return islower_l((unsigned char) wc, locale->lt);
151}
152
153static bool
155{
156 return isgraph_l((unsigned char) wc, locale->lt);
157}
158
159static bool
161{
162 return isprint_l((unsigned char) wc, locale->lt);
163}
164
165static bool
167{
168 return ispunct_l((unsigned char) wc, locale->lt);
169}
170
171static bool
173{
174 return isspace_l((unsigned char) wc, locale->lt);
175}
176
177static bool
179{
180#ifndef WIN32
181 return isxdigit_l((unsigned char) wc, locale->lt);
182#else
183 return _isxdigit_l((unsigned char) wc, locale->lt);
184#endif
185}
186
187static bool
189{
190 return iswdigit_l((wint_t) wc, locale->lt);
191}
192
193static bool
195{
196 return iswalpha_l((wint_t) wc, locale->lt);
197}
198
199static bool
201{
202 return iswalnum_l((wint_t) wc, locale->lt);
203}
204
205static bool
207{
208 return iswupper_l((wint_t) wc, locale->lt);
209}
210
211static bool
213{
214 return iswlower_l((wint_t) wc, locale->lt);
215}
216
217static bool
219{
220 return iswgraph_l((wint_t) wc, locale->lt);
221}
222
223static bool
225{
226 return iswprint_l((wint_t) wc, locale->lt);
227}
228
229static bool
231{
232 return iswpunct_l((wint_t) wc, locale->lt);
233}
234
235static bool
237{
238 return iswspace_l((wint_t) wc, locale->lt);
239}
240
241static bool
243{
244#ifndef WIN32
245 return iswxdigit_l((wint_t) wc, locale->lt);
246#else
247 return _iswxdigit_l((wint_t) wc, locale->lt);
248#endif
249}
250
251static char
253{
255 return tolower_l(ch, locale->lt);
256}
257
258static bool
260{
261 bool is_multibyte = pg_database_encoding_max_length() > 1;
262
263 if (is_multibyte && IS_HIGHBIT_SET(ch))
264 return true;
265 else
266 return isalpha_l((unsigned char) ch, locale->lt);
267}
268
269static pg_wchar
271{
273
274 /* force C behavior for ASCII characters, per comments above */
275 if (locale->is_default && wc <= (pg_wchar) 127)
276 return pg_ascii_toupper((unsigned char) wc);
277 if (wc <= (pg_wchar) UCHAR_MAX)
278 return toupper_l((unsigned char) wc, locale->lt);
279 else
280 return wc;
281}
282
283static pg_wchar
285{
287
288 /* force C behavior for ASCII characters, per comments above */
289 if (locale->is_default && wc <= (pg_wchar) 127)
290 return pg_ascii_toupper((unsigned char) wc);
291 if (sizeof(wchar_t) >= 4 || wc <= (pg_wchar) 0xFFFF)
292 return towupper_l((wint_t) wc, locale->lt);
293 else
294 return wc;
295}
296
297static pg_wchar
299{
301
302 /* force C behavior for ASCII characters, per comments above */
303 if (locale->is_default && wc <= (pg_wchar) 127)
304 return pg_ascii_tolower((unsigned char) wc);
305 if (wc <= (pg_wchar) UCHAR_MAX)
306 return tolower_l((unsigned char) wc, locale->lt);
307 else
308 return wc;
309}
310
311static pg_wchar
313{
315
316 /* force C behavior for ASCII characters, per comments above */
317 if (locale->is_default && wc <= (pg_wchar) 127)
318 return pg_ascii_tolower((unsigned char) wc);
319 if (sizeof(wchar_t) >= 4 || wc <= (pg_wchar) 0xFFFF)
320 return towlower_l((wint_t) wc, locale->lt);
321 else
322 return wc;
323}
324
327 .strtitle = strtitle_libc_sb,
328 .strupper = strupper_libc_sb,
329 .wc_isdigit = wc_isdigit_libc_sb,
330 .wc_isalpha = wc_isalpha_libc_sb,
331 .wc_isalnum = wc_isalnum_libc_sb,
332 .wc_isupper = wc_isupper_libc_sb,
333 .wc_islower = wc_islower_libc_sb,
334 .wc_isgraph = wc_isgraph_libc_sb,
335 .wc_isprint = wc_isprint_libc_sb,
336 .wc_ispunct = wc_ispunct_libc_sb,
337 .wc_isspace = wc_isspace_libc_sb,
338 .wc_isxdigit = wc_isxdigit_libc_sb,
339 .char_is_cased = char_is_cased_libc,
340 .char_tolower = char_tolower_libc,
341 .wc_toupper = toupper_libc_sb,
342 .wc_tolower = tolower_libc_sb,
343 .max_chr = UCHAR_MAX,
344};
345
346/*
347 * Non-UTF8 multibyte encodings use multibyte semantics for case mapping, but
348 * single-byte semantics for pattern matching.
349 */
352 .strtitle = strtitle_libc_mb,
353 .strupper = strupper_libc_mb,
354 .wc_isdigit = wc_isdigit_libc_sb,
355 .wc_isalpha = wc_isalpha_libc_sb,
356 .wc_isalnum = wc_isalnum_libc_sb,
357 .wc_isupper = wc_isupper_libc_sb,
358 .wc_islower = wc_islower_libc_sb,
359 .wc_isgraph = wc_isgraph_libc_sb,
360 .wc_isprint = wc_isprint_libc_sb,
361 .wc_ispunct = wc_ispunct_libc_sb,
362 .wc_isspace = wc_isspace_libc_sb,
363 .wc_isxdigit = wc_isxdigit_libc_sb,
364 .char_is_cased = char_is_cased_libc,
365 .char_tolower = char_tolower_libc,
366 .wc_toupper = toupper_libc_sb,
367 .wc_tolower = tolower_libc_sb,
368 .max_chr = UCHAR_MAX,
369};
370
373 .strtitle = strtitle_libc_mb,
374 .strupper = strupper_libc_mb,
375 .wc_isdigit = wc_isdigit_libc_mb,
376 .wc_isalpha = wc_isalpha_libc_mb,
377 .wc_isalnum = wc_isalnum_libc_mb,
378 .wc_isupper = wc_isupper_libc_mb,
379 .wc_islower = wc_islower_libc_mb,
380 .wc_isgraph = wc_isgraph_libc_mb,
381 .wc_isprint = wc_isprint_libc_mb,
382 .wc_ispunct = wc_ispunct_libc_mb,
383 .wc_isspace = wc_isspace_libc_mb,
384 .wc_isxdigit = wc_isxdigit_libc_mb,
385 .char_is_cased = char_is_cased_libc,
386 .char_tolower = char_tolower_libc,
387 .wc_toupper = toupper_libc_mb,
388 .wc_tolower = tolower_libc_mb,
389};
390
393 .strnxfrm = strnxfrm_libc,
394 .strnxfrm_prefix = NULL,
395
396 /*
397 * Unfortunately, it seems that strxfrm() for non-C collations is broken
398 * on many common platforms; testing of multiple versions of glibc reveals
399 * that, for many locales, strcoll() and strxfrm() do not return
400 * consistent results. While no other libc other than Cygwin has so far
401 * been shown to have a problem, we take the conservative course of action
402 * for right now and disable this categorically. (Users who are certain
403 * this isn't a problem on their system can define TRUST_STRXFRM.)
404 */
405#ifdef TRUST_STRXFRM
406 .strxfrm_is_safe = true,
407#else
408 .strxfrm_is_safe = false,
409#endif
410};
411
412#ifdef WIN32
413static const struct collate_methods collate_methods_libc_win32_utf8 = {
414 .strncoll = strncoll_libc_win32_utf8,
415 .strnxfrm = strnxfrm_libc,
416 .strnxfrm_prefix = NULL,
417#ifdef TRUST_STRXFRM
418 .strxfrm_is_safe = true,
419#else
420 .strxfrm_is_safe = false,
421#endif
422};
423#endif
424
425static size_t
426strlower_libc_sb(char *dest, size_t destsize, const char *src, ssize_t srclen,
428{
429 if (srclen < 0)
430 srclen = strlen(src);
431
432 if (srclen + 1 <= destsize)
433 {
434 locale_t loc = locale->lt;
435 char *p;
436
437 if (srclen + 1 > destsize)
438 return srclen;
439
440 memcpy(dest, src, srclen);
441 dest[srclen] = '\0';
442
443 /*
444 * Note: we assume that tolower_l() will not be so broken as to need
445 * an isupper_l() guard test. When using the default collation, we
446 * apply the traditional Postgres behavior that forces ASCII-style
447 * treatment of I/i, but in non-default collations you get exactly
448 * what the collation says.
449 */
450 for (p = dest; *p; p++)
451 {
452 if (locale->is_default)
453 *p = pg_tolower((unsigned char) *p);
454 else
455 *p = tolower_l((unsigned char) *p, loc);
456 }
457 }
458
459 return srclen;
460}
461
462static size_t
463strlower_libc_mb(char *dest, size_t destsize, const char *src, ssize_t srclen,
465{
466 locale_t loc = locale->lt;
467 size_t result_size;
468 wchar_t *workspace;
469 char *result;
470 size_t curr_char;
471 size_t max_size;
472
473 if (srclen < 0)
474 srclen = strlen(src);
475
476 /* Overflow paranoia */
477 if ((srclen + 1) > (INT_MAX / sizeof(wchar_t)))
479 (errcode(ERRCODE_OUT_OF_MEMORY),
480 errmsg("out of memory")));
481
482 /* Output workspace cannot have more codes than input bytes */
483 workspace = (wchar_t *) palloc((srclen + 1) * sizeof(wchar_t));
484
485 char2wchar(workspace, srclen + 1, src, srclen, loc);
486
487 for (curr_char = 0; workspace[curr_char] != 0; curr_char++)
488 workspace[curr_char] = towlower_l(workspace[curr_char], loc);
489
490 /*
491 * Make result large enough; case change might change number of bytes
492 */
493 max_size = curr_char * pg_database_encoding_max_length();
494 result = palloc(max_size + 1);
495
496 result_size = wchar2char(result, workspace, max_size + 1, loc);
497
498 if (result_size + 1 > destsize)
499 return result_size;
500
501 memcpy(dest, result, result_size);
502 dest[result_size] = '\0';
503
504 pfree(workspace);
505 pfree(result);
506
507 return result_size;
508}
509
510static size_t
511strtitle_libc_sb(char *dest, size_t destsize, const char *src, ssize_t srclen,
513{
514 if (srclen < 0)
515 srclen = strlen(src);
516
517 if (srclen + 1 <= destsize)
518 {
519 locale_t loc = locale->lt;
520 int wasalnum = false;
521 char *p;
522
523 memcpy(dest, src, srclen);
524 dest[srclen] = '\0';
525
526 /*
527 * Note: we assume that toupper_l()/tolower_l() will not be so broken
528 * as to need guard tests. When using the default collation, we apply
529 * the traditional Postgres behavior that forces ASCII-style treatment
530 * of I/i, but in non-default collations you get exactly what the
531 * collation says.
532 */
533 for (p = dest; *p; p++)
534 {
535 if (locale->is_default)
536 {
537 if (wasalnum)
538 *p = pg_tolower((unsigned char) *p);
539 else
540 *p = pg_toupper((unsigned char) *p);
541 }
542 else
543 {
544 if (wasalnum)
545 *p = tolower_l((unsigned char) *p, loc);
546 else
547 *p = toupper_l((unsigned char) *p, loc);
548 }
549 wasalnum = isalnum_l((unsigned char) *p, loc);
550 }
551 }
552
553 return srclen;
554}
555
556static size_t
557strtitle_libc_mb(char *dest, size_t destsize, const char *src, ssize_t srclen,
559{
560 locale_t loc = locale->lt;
561 int wasalnum = false;
562 size_t result_size;
563 wchar_t *workspace;
564 char *result;
565 size_t curr_char;
566 size_t max_size;
567
568 if (srclen < 0)
569 srclen = strlen(src);
570
571 /* Overflow paranoia */
572 if ((srclen + 1) > (INT_MAX / sizeof(wchar_t)))
574 (errcode(ERRCODE_OUT_OF_MEMORY),
575 errmsg("out of memory")));
576
577 /* Output workspace cannot have more codes than input bytes */
578 workspace = (wchar_t *) palloc((srclen + 1) * sizeof(wchar_t));
579
580 char2wchar(workspace, srclen + 1, src, srclen, loc);
581
582 for (curr_char = 0; workspace[curr_char] != 0; curr_char++)
583 {
584 if (wasalnum)
585 workspace[curr_char] = towlower_l(workspace[curr_char], loc);
586 else
587 workspace[curr_char] = towupper_l(workspace[curr_char], loc);
588 wasalnum = iswalnum_l(workspace[curr_char], loc);
589 }
590
591 /*
592 * Make result large enough; case change might change number of bytes
593 */
594 max_size = curr_char * pg_database_encoding_max_length();
595 result = palloc(max_size + 1);
596
597 result_size = wchar2char(result, workspace, max_size + 1, loc);
598
599 if (result_size + 1 > destsize)
600 return result_size;
601
602 memcpy(dest, result, result_size);
603 dest[result_size] = '\0';
604
605 pfree(workspace);
606 pfree(result);
607
608 return result_size;
609}
610
611static size_t
612strupper_libc_sb(char *dest, size_t destsize, const char *src, ssize_t srclen,
614{
615 if (srclen < 0)
616 srclen = strlen(src);
617
618 if (srclen + 1 <= destsize)
619 {
620 locale_t loc = locale->lt;
621 char *p;
622
623 memcpy(dest, src, srclen);
624 dest[srclen] = '\0';
625
626 /*
627 * Note: we assume that toupper_l() will not be so broken as to need
628 * an islower_l() guard test. When using the default collation, we
629 * apply the traditional Postgres behavior that forces ASCII-style
630 * treatment of I/i, but in non-default collations you get exactly
631 * what the collation says.
632 */
633 for (p = dest; *p; p++)
634 {
635 if (locale->is_default)
636 *p = pg_toupper((unsigned char) *p);
637 else
638 *p = toupper_l((unsigned char) *p, loc);
639 }
640 }
641
642 return srclen;
643}
644
645static size_t
646strupper_libc_mb(char *dest, size_t destsize, const char *src, ssize_t srclen,
648{
649 locale_t loc = locale->lt;
650 size_t result_size;
651 wchar_t *workspace;
652 char *result;
653 size_t curr_char;
654 size_t max_size;
655
656 if (srclen < 0)
657 srclen = strlen(src);
658
659 /* Overflow paranoia */
660 if ((srclen + 1) > (INT_MAX / sizeof(wchar_t)))
662 (errcode(ERRCODE_OUT_OF_MEMORY),
663 errmsg("out of memory")));
664
665 /* Output workspace cannot have more codes than input bytes */
666 workspace = (wchar_t *) palloc((srclen + 1) * sizeof(wchar_t));
667
668 char2wchar(workspace, srclen + 1, src, srclen, loc);
669
670 for (curr_char = 0; workspace[curr_char] != 0; curr_char++)
671 workspace[curr_char] = towupper_l(workspace[curr_char], loc);
672
673 /*
674 * Make result large enough; case change might change number of bytes
675 */
676 max_size = curr_char * pg_database_encoding_max_length();
677 result = palloc(max_size + 1);
678
679 result_size = wchar2char(result, workspace, max_size + 1, loc);
680
681 if (result_size + 1 > destsize)
682 return result_size;
683
684 memcpy(dest, result, result_size);
685 dest[result_size] = '\0';
686
687 pfree(workspace);
688 pfree(result);
689
690 return result_size;
691}
692
695{
696 const char *collate;
697 const char *ctype;
698 locale_t loc;
699 pg_locale_t result;
700
701 if (collid == DEFAULT_COLLATION_OID)
702 {
703 HeapTuple tp;
704 Datum datum;
705
707 if (!HeapTupleIsValid(tp))
708 elog(ERROR, "cache lookup failed for database %u", MyDatabaseId);
709 datum = SysCacheGetAttrNotNull(DATABASEOID, tp,
710 Anum_pg_database_datcollate);
711 collate = TextDatumGetCString(datum);
712 datum = SysCacheGetAttrNotNull(DATABASEOID, tp,
713 Anum_pg_database_datctype);
714 ctype = TextDatumGetCString(datum);
715
716 ReleaseSysCache(tp);
717 }
718 else
719 {
720 HeapTuple tp;
721 Datum datum;
722
724 if (!HeapTupleIsValid(tp))
725 elog(ERROR, "cache lookup failed for collation %u", collid);
726
727 datum = SysCacheGetAttrNotNull(COLLOID, tp,
728 Anum_pg_collation_collcollate);
729 collate = TextDatumGetCString(datum);
730 datum = SysCacheGetAttrNotNull(COLLOID, tp,
731 Anum_pg_collation_collctype);
732 ctype = TextDatumGetCString(datum);
733
734 ReleaseSysCache(tp);
735 }
736
737
738 loc = make_libc_collator(collate, ctype);
739
740 result = MemoryContextAllocZero(context, sizeof(struct pg_locale_struct));
741 result->deterministic = true;
742 result->collate_is_c = (strcmp(collate, "C") == 0) ||
743 (strcmp(collate, "POSIX") == 0);
744 result->ctype_is_c = (strcmp(ctype, "C") == 0) ||
745 (strcmp(ctype, "POSIX") == 0);
746 result->lt = loc;
747 if (!result->collate_is_c)
748 {
749#ifdef WIN32
751 result->collate = &collate_methods_libc_win32_utf8;
752 else
753#endif
754 result->collate = &collate_methods_libc;
755 }
756 if (!result->ctype_is_c)
757 {
762 else
763 result->ctype = &ctype_methods_libc_sb;
764 }
765
766 return result;
767}
768
769/*
770 * Create a locale_t with the given collation and ctype.
771 *
772 * The "C" and "POSIX" locales are not actually handled by libc, so return
773 * NULL.
774 *
775 * Ensure that no path leaks a locale_t.
776 */
777static locale_t
778make_libc_collator(const char *collate, const char *ctype)
779{
780 locale_t loc = 0;
781
782 if (strcmp(collate, ctype) == 0)
783 {
784 if (strcmp(ctype, "C") != 0 && strcmp(ctype, "POSIX") != 0)
785 {
786 /* Normal case where they're the same */
787 errno = 0;
788#ifndef WIN32
789 loc = newlocale(LC_COLLATE_MASK | LC_CTYPE_MASK, collate,
790 NULL);
791#else
792 loc = _create_locale(LC_ALL, collate);
793#endif
794 if (!loc)
796 }
797 }
798 else
799 {
800#ifndef WIN32
801 /* We need two newlocale() steps */
802 locale_t loc1 = 0;
803
804 if (strcmp(collate, "C") != 0 && strcmp(collate, "POSIX") != 0)
805 {
806 errno = 0;
807 loc1 = newlocale(LC_COLLATE_MASK, collate, NULL);
808 if (!loc1)
810 }
811
812 if (strcmp(ctype, "C") != 0 && strcmp(ctype, "POSIX") != 0)
813 {
814 errno = 0;
815 loc = newlocale(LC_CTYPE_MASK, ctype, loc1);
816 if (!loc)
817 {
818 if (loc1)
819 freelocale(loc1);
821 }
822 }
823 else
824 loc = loc1;
825#else
826
827 /*
828 * XXX The _create_locale() API doesn't appear to support this. Could
829 * perhaps be worked around by changing pg_locale_t to contain two
830 * separate fields.
831 */
833 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
834 errmsg("collations with different collate and ctype values are not supported on this platform")));
835#endif
836 }
837
838 return loc;
839}
840
841/*
842 * strncoll_libc
843 *
844 * NUL-terminate arguments, if necessary, and pass to strcoll_l().
845 *
846 * An input string length of -1 means that it's already NUL-terminated.
847 */
848int
849strncoll_libc(const char *arg1, ssize_t len1, const char *arg2, ssize_t len2,
851{
852 char sbuf[TEXTBUFLEN];
853 char *buf = sbuf;
854 size_t bufsize1 = (len1 == -1) ? 0 : len1 + 1;
855 size_t bufsize2 = (len2 == -1) ? 0 : len2 + 1;
856 const char *arg1n;
857 const char *arg2n;
858 int result;
859
860 if (bufsize1 + bufsize2 > TEXTBUFLEN)
861 buf = palloc(bufsize1 + bufsize2);
862
863 /* nul-terminate arguments if necessary */
864 if (len1 == -1)
865 {
866 arg1n = arg1;
867 }
868 else
869 {
870 char *buf1 = buf;
871
872 memcpy(buf1, arg1, len1);
873 buf1[len1] = '\0';
874 arg1n = buf1;
875 }
876
877 if (len2 == -1)
878 {
879 arg2n = arg2;
880 }
881 else
882 {
883 char *buf2 = buf + bufsize1;
884
885 memcpy(buf2, arg2, len2);
886 buf2[len2] = '\0';
887 arg2n = buf2;
888 }
889
890 result = strcoll_l(arg1n, arg2n, locale->lt);
891
892 if (buf != sbuf)
893 pfree(buf);
894
895 return result;
896}
897
898/*
899 * strnxfrm_libc
900 *
901 * NUL-terminate src, if necessary, and pass to strxfrm_l().
902 *
903 * A source length of -1 means that it's already NUL-terminated.
904 */
905size_t
906strnxfrm_libc(char *dest, size_t destsize, const char *src, ssize_t srclen,
908{
909 char sbuf[TEXTBUFLEN];
910 char *buf = sbuf;
911 size_t bufsize = srclen + 1;
912 size_t result;
913
914 if (srclen == -1)
915 return strxfrm_l(dest, src, destsize, locale->lt);
916
917 if (bufsize > TEXTBUFLEN)
918 buf = palloc(bufsize);
919
920 /* nul-terminate argument */
921 memcpy(buf, src, srclen);
922 buf[srclen] = '\0';
923
924 result = strxfrm_l(dest, buf, destsize, locale->lt);
925
926 if (buf != sbuf)
927 pfree(buf);
928
929 /* if dest is defined, it should be nul-terminated */
930 Assert(result >= destsize || dest[result] == '\0');
931
932 return result;
933}
934
/*
 * get_collation_actual_version_libc
 *
 * Return a palloc'd version string for the given libc collation name, or
 * NULL when no version is available.  NULL is returned for "C", "C.*" and
 * "POSIX" (compared case-insensitively), on platforms matching none of the
 * branches below, and on Windows when GetNLSVersionEx() rejects the name
 * with ERROR_INVALID_PARAMETER.
 */
char *
get_collation_actual_version_libc(const char *collcollate)
{
	char	   *collversion = NULL;

	if (pg_strcasecmp("C", collcollate) != 0 &&
		pg_strncasecmp("C.", collcollate, 2) != 0 &&
		pg_strcasecmp("POSIX", collcollate) != 0)
	{
#if defined(__GLIBC__)
		/* Use the glibc version because we don't have anything better. */
		collversion = pstrdup(gnu_get_libc_version());
#elif defined(LC_VERSION_MASK)
		locale_t	loc;

		/* Look up FreeBSD collation version. */
		loc = newlocale(LC_COLLATE_MASK, collcollate, NULL);
		if (loc)
		{
			collversion =
				pstrdup(querylocale(LC_COLLATE_MASK | LC_VERSION_MASK, loc));
			freelocale(loc);
		}
		else
			ereport(ERROR,
					(errmsg("could not load locale \"%s\"", collcollate)));
#elif defined(WIN32)
		/*
		 * If we are targeting Windows Vista and above, we can ask for a name
		 * given a collation name (earlier versions required a location code
		 * that we don't have).
		 */
		NLSVERSIONINFOEX version = {sizeof(NLSVERSIONINFOEX)};
		WCHAR		wide_collcollate[LOCALE_NAME_MAX_LENGTH];

		MultiByteToWideChar(CP_ACP, 0, collcollate, -1, wide_collcollate,
							LOCALE_NAME_MAX_LENGTH);
		if (!GetNLSVersionEx(COMPARE_STRING, wide_collcollate, &version))
		{
			/*
			 * GetNLSVersionEx() wants a language tag such as "en-US", not a
			 * locale name like "English_United States.1252".  Until those
			 * values can be prevented from entering the system, or 100%
			 * reliably converted to the more useful tag format, tolerate the
			 * resulting error and report that we have no version data.
			 */
			if (GetLastError() == ERROR_INVALID_PARAMETER)
				return NULL;

			ereport(ERROR,
					(errmsg("could not get collation version for locale \"%s\": error code %lu",
							collcollate,
							GetLastError())));
		}
		collversion = psprintf("%lu.%lu,%lu.%lu",
							   (version.dwNLSVersion >> 8) & 0xFFFF,
							   version.dwNLSVersion & 0xFF,
							   (version.dwDefinedVersion >> 8) & 0xFFFF,
							   version.dwDefinedVersion & 0xFF);
#endif
	}

	return collversion;
}
999
1000/*
1001 * strncoll_libc_win32_utf8
1002 *
1003 * Win32 does not have UTF-8. Convert UTF8 arguments to wide characters and
1004 * invoke wcscoll_l().
1005 *
1006 * An input string length of -1 means that it's NUL-terminated.
1007 */
1008#ifdef WIN32
1009static int
1010strncoll_libc_win32_utf8(const char *arg1, ssize_t len1, const char *arg2,
1011 ssize_t len2, pg_locale_t locale)
1012{
1013 char sbuf[TEXTBUFLEN];
1014 char *buf = sbuf;
1015 char *a1p,
1016 *a2p;
1017 int a1len;
1018 int a2len;
1019 int r;
1020 int result;
1021
1023
1024 if (len1 == -1)
1025 len1 = strlen(arg1);
1026 if (len2 == -1)
1027 len2 = strlen(arg2);
1028
1029 a1len = len1 * 2 + 2;
1030 a2len = len2 * 2 + 2;
1031
1032 if (a1len + a2len > TEXTBUFLEN)
1033 buf = palloc(a1len + a2len);
1034
1035 a1p = buf;
1036 a2p = buf + a1len;
1037
1038 /* API does not work for zero-length input */
1039 if (len1 == 0)
1040 r = 0;
1041 else
1042 {
1043 r = MultiByteToWideChar(CP_UTF8, 0, arg1, len1,
1044 (LPWSTR) a1p, a1len / 2);
1045 if (!r)
1046 ereport(ERROR,
1047 (errmsg("could not convert string to UTF-16: error code %lu",
1048 GetLastError())));
1049 }
1050 ((LPWSTR) a1p)[r] = 0;
1051
1052 if (len2 == 0)
1053 r = 0;
1054 else
1055 {
1056 r = MultiByteToWideChar(CP_UTF8, 0, arg2, len2,
1057 (LPWSTR) a2p, a2len / 2);
1058 if (!r)
1059 ereport(ERROR,
1060 (errmsg("could not convert string to UTF-16: error code %lu",
1061 GetLastError())));
1062 }
1063 ((LPWSTR) a2p)[r] = 0;
1064
1065 errno = 0;
1066 result = wcscoll_l((LPWSTR) a1p, (LPWSTR) a2p, locale->lt);
1067 if (result == 2147483647) /* _NLSCMPERROR; missing from mingw headers */
1068 ereport(ERROR,
1069 (errmsg("could not compare Unicode strings: %m")));
1070
1071 if (buf != sbuf)
1072 pfree(buf);
1073
1074 return result;
1075}
1076#endif /* WIN32 */
1077
1078/* simple subroutine for reporting errors from newlocale() */
1079void
1080report_newlocale_failure(const char *localename)
1081{
1082 int save_errno;
1083
1084 /*
1085 * Windows doesn't provide any useful error indication from
1086 * _create_locale(), and BSD-derived platforms don't seem to feel they
1087 * need to set errno either (even though POSIX is pretty clear that
1088 * newlocale should do so). So, if errno hasn't been set, assume ENOENT
1089 * is what to report.
1090 */
1091 if (errno == 0)
1092 errno = ENOENT;
1093
1094 /*
1095 * ENOENT means "no such locale", not "no such file", so clarify that
1096 * errno with an errdetail message.
1097 */
1098 save_errno = errno; /* auxiliary funcs might change errno */
1099 ereport(ERROR,
1100 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
1101 errmsg("could not create locale \"%s\": %m",
1102 localename),
1103 (save_errno == ENOENT ?
1104 errdetail("The operating system could not find any locale data for the locale name \"%s\".",
1105 localename) : 0)));
1106}
1107
1108/*
1109 * POSIX doesn't define _l-variants of these functions, but several systems
1110 * have them. We provide our own replacements here.
1111 */
1112#ifndef HAVE_MBSTOWCS_L
1113static size_t
1114mbstowcs_l(wchar_t *dest, const char *src, size_t n, locale_t loc)
1115{
1116#ifdef WIN32
1117 return _mbstowcs_l(dest, src, n, loc);
1118#else
1119 size_t result;
1120 locale_t save_locale = uselocale(loc);
1121
1122 result = mbstowcs(dest, src, n);
1123 uselocale(save_locale);
1124 return result;
1125#endif
1126}
1127#endif
1128#ifndef HAVE_WCSTOMBS_L
1129static size_t
1130wcstombs_l(char *dest, const wchar_t *src, size_t n, locale_t loc)
1131{
1132#ifdef WIN32
1133 return _wcstombs_l(dest, src, n, loc);
1134#else
1135 size_t result;
1136 locale_t save_locale = uselocale(loc);
1137
1138 result = wcstombs(dest, src, n);
1139 uselocale(save_locale);
1140 return result;
1141#endif
1142}
1143#endif
1144
1145/*
1146 * These functions convert from/to libc's wchar_t, *not* pg_wchar.
1147 * Therefore we keep them here rather than with the mbutils code.
1148 */
1149
1150/*
1151 * wchar2char --- convert wide characters to multibyte format
1152 *
1153 * This has the same API as the standard wcstombs_l() function; in particular,
1154 * tolen is the maximum number of bytes to store at *to, and *from must be
1155 * zero-terminated. The output will be zero-terminated iff there is room.
1156 */
1157size_t
1158wchar2char(char *to, const wchar_t *from, size_t tolen, locale_t loc)
1159{
1160 size_t result;
1161
1162 if (tolen == 0)
1163 return 0;
1164
1165#ifdef WIN32
1166
1167 /*
1168 * On Windows, the "Unicode" locales assume UTF16 not UTF8 encoding, and
1169 * for some reason mbstowcs and wcstombs won't do this for us, so we use
1170 * MultiByteToWideChar().
1171 */
1173 {
1174 result = WideCharToMultiByte(CP_UTF8, 0, from, -1, to, tolen,
1175 NULL, NULL);
1176 /* A zero return is failure */
1177 if (result <= 0)
1178 result = -1;
1179 else
1180 {
1181 Assert(result <= tolen);
1182 /* Microsoft counts the zero terminator in the result */
1183 result--;
1184 }
1185 }
1186 else
1187#endif /* WIN32 */
1188 if (loc == (locale_t) 0)
1189 {
1190 /* Use wcstombs directly for the default locale */
1191 result = wcstombs(to, from, tolen);
1192 }
1193 else
1194 {
1195 /* Use wcstombs_l for nondefault locales */
1196 result = wcstombs_l(to, from, tolen, loc);
1197 }
1198
1199 return result;
1200}
1201
1202/*
1203 * char2wchar --- convert multibyte characters to wide characters
1204 *
1205 * This has almost the API of mbstowcs_l(), except that *from need not be
1206 * null-terminated; instead, the number of input bytes is specified as
1207 * fromlen. Also, we ereport() rather than returning -1 for invalid
1208 * input encoding. tolen is the maximum number of wchar_t's to store at *to.
1209 * The output will be zero-terminated iff there is room.
1210 */
1211static size_t
1212char2wchar(wchar_t *to, size_t tolen, const char *from, size_t fromlen,
1213 locale_t loc)
1214{
1215 size_t result;
1216
1217 if (tolen == 0)
1218 return 0;
1219
1220#ifdef WIN32
1221 /* See WIN32 "Unicode" comment above */
1223 {
1224 /* Win32 API does not work for zero-length input */
1225 if (fromlen == 0)
1226 result = 0;
1227 else
1228 {
1229 result = MultiByteToWideChar(CP_UTF8, 0, from, fromlen, to, tolen - 1);
1230 /* A zero return is failure */
1231 if (result == 0)
1232 result = -1;
1233 }
1234
1235 if (result != -1)
1236 {
1237 Assert(result < tolen);
1238 /* Append trailing null wchar (MultiByteToWideChar() does not) */
1239 to[result] = 0;
1240 }
1241 }
1242 else
1243#endif /* WIN32 */
1244 {
1245 /* mbstowcs requires ending '\0' */
1246 char *str = pnstrdup(from, fromlen);
1247
1248 if (loc == (locale_t) 0)
1249 {
1250 /* Use mbstowcs directly for the default locale */
1251 result = mbstowcs(to, str, tolen);
1252 }
1253 else
1254 {
1255 /* Use mbstowcs_l for nondefault locales */
1256 result = mbstowcs_l(to, str, tolen, loc);
1257 }
1258
1259 pfree(str);
1260 }
1261
1262 if (result == -1)
1263 {
1264 /*
1265 * Invalid multibyte character encountered. We try to give a useful
1266 * error message by letting pg_verifymbstr check the string. But it's
1267 * possible that the string is OK to us, and not OK to mbstowcs ---
1268 * this suggests that the LC_CTYPE locale is different from the
1269 * database encoding. Give a generic error message if pg_verifymbstr
1270 * can't find anything wrong.
1271 */
1272 pg_verifymbstr(from, fromlen, false); /* might not return */
1273 /* but if it does ... */
1274 ereport(ERROR,
1275 (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
1276 errmsg("invalid multibyte character for locale"),
1277 errhint("The server's LC_CTYPE locale is probably incompatible with the database encoding.")));
1278 }
1279
1280 return result;
1281}
#define TextDatumGetCString(d)
Definition: builtins.h:98
#define IS_HIGHBIT_SET(ch)
Definition: c.h:1155
Oid collid
int errdetail(const char *fmt,...)
Definition: elog.c:1216
int errhint(const char *fmt,...)
Definition: elog.c:1330
int errcode(int sqlerrcode)
Definition: elog.c:863
int errmsg(const char *fmt,...)
Definition: elog.c:1080
#define ERROR
Definition: elog.h:39
#define elog(elevel,...)
Definition: elog.h:226
#define ereport(elevel,...)
Definition: elog.h:150
Oid MyDatabaseId
Definition: globals.c:94
Assert(PointerIsAligned(start, uint64))
const char * str
#define HeapTupleIsValid(tuple)
Definition: htup.h:78
#define bufsize
Definition: indent_globs.h:36
static char * locale
Definition: initdb.c:140
if(TABLE==NULL||TABLE_index==NULL)
Definition: isn.c:81
unsigned int pg_wchar
Definition: mbprint.c:31
int GetDatabaseEncoding(void)
Definition: mbutils.c:1262
bool pg_verifymbstr(const char *mbstr, int len, bool noError)
Definition: mbutils.c:1557
int pg_database_encoding_max_length(void)
Definition: mbutils.c:1547
void * MemoryContextAllocZero(MemoryContext context, Size size)
Definition: mcxt.c:1263
char * pstrdup(const char *in)
Definition: mcxt.c:1759
void pfree(void *pointer)
Definition: mcxt.c:1594
void * palloc(Size size)
Definition: mcxt.c:1365
char * pnstrdup(const char *in, Size len)
Definition: mcxt.c:1770
static bool wc_isalpha_libc_mb(pg_wchar wc, pg_locale_t locale)
static size_t strtitle_libc_mb(char *dest, size_t destsize, const char *src, ssize_t srclen, pg_locale_t locale)
static bool wc_ispunct_libc_sb(pg_wchar wc, pg_locale_t locale)
static const struct ctype_methods ctype_methods_libc_other_mb
static const struct ctype_methods ctype_methods_libc_utf8
static pg_wchar toupper_libc_mb(pg_wchar wc, pg_locale_t locale)
static size_t strupper_libc_sb(char *dest, size_t destsize, const char *src, ssize_t srclen, pg_locale_t locale)
static bool wc_isprint_libc_mb(pg_wchar wc, pg_locale_t locale)
static bool wc_isdigit_libc_sb(pg_wchar wc, pg_locale_t locale)
pg_locale_t create_pg_locale_libc(Oid collid, MemoryContext context)
size_t wchar2char(char *to, const wchar_t *from, size_t tolen, locale_t loc)
static bool wc_isspace_libc_sb(pg_wchar wc, pg_locale_t locale)
static bool wc_islower_libc_sb(pg_wchar wc, pg_locale_t locale)
static bool wc_isupper_libc_mb(pg_wchar wc, pg_locale_t locale)
static pg_wchar toupper_libc_sb(pg_wchar wc, pg_locale_t locale)
static bool wc_isgraph_libc_mb(pg_wchar wc, pg_locale_t locale)
static size_t char2wchar(wchar_t *to, size_t tolen, const char *from, size_t fromlen, locale_t loc)
static bool wc_isalnum_libc_mb(pg_wchar wc, pg_locale_t locale)
static bool wc_isalnum_libc_sb(pg_wchar wc, pg_locale_t locale)
static bool wc_isalpha_libc_sb(pg_wchar wc, pg_locale_t locale)
static bool wc_isprint_libc_sb(pg_wchar wc, pg_locale_t locale)
char * get_collation_actual_version_libc(const char *collcollate)
static bool wc_isupper_libc_sb(pg_wchar wc, pg_locale_t locale)
static locale_t make_libc_collator(const char *collate, const char *ctype)
static bool wc_isgraph_libc_sb(pg_wchar wc, pg_locale_t locale)
static pg_wchar tolower_libc_sb(pg_wchar wc, pg_locale_t locale)
static size_t wcstombs_l(char *dest, const wchar_t *src, size_t n, locale_t loc)
static const struct collate_methods collate_methods_libc
static bool wc_ispunct_libc_mb(pg_wchar wc, pg_locale_t locale)
static size_t strupper_libc_mb(char *dest, size_t destsize, const char *src, ssize_t srclen, pg_locale_t locale)
static int strncoll_libc(const char *arg1, ssize_t len1, const char *arg2, ssize_t len2, pg_locale_t locale)
static size_t strlower_libc_sb(char *dest, size_t destsize, const char *src, ssize_t srclen, pg_locale_t locale)
static bool wc_islower_libc_mb(pg_wchar wc, pg_locale_t locale)
static const struct ctype_methods ctype_methods_libc_sb
static size_t strnxfrm_libc(char *dest, size_t destsize, const char *src, ssize_t srclen, pg_locale_t locale)
static bool wc_isxdigit_libc_sb(pg_wchar wc, pg_locale_t locale)
void report_newlocale_failure(const char *localename)
static pg_wchar tolower_libc_mb(pg_wchar wc, pg_locale_t locale)
static bool wc_isdigit_libc_mb(pg_wchar wc, pg_locale_t locale)
static char char_tolower_libc(unsigned char ch, pg_locale_t locale)
static bool char_is_cased_libc(char ch, pg_locale_t locale)
static bool wc_isspace_libc_mb(pg_wchar wc, pg_locale_t locale)
#define TEXTBUFLEN
static size_t strtitle_libc_sb(char *dest, size_t destsize, const char *src, ssize_t srclen, pg_locale_t locale)
static bool wc_isxdigit_libc_mb(pg_wchar wc, pg_locale_t locale)
static size_t mbstowcs_l(wchar_t *dest, const char *src, size_t n, locale_t loc)
static size_t strlower_libc_mb(char *dest, size_t destsize, const char *src, ssize_t srclen, pg_locale_t locale)
static char * buf
Definition: pg_test_fsync.c:72
@ PG_UTF8
Definition: pg_wchar.h:232
int pg_strcasecmp(const char *s1, const char *s2)
Definition: pgstrcasecmp.c:36
unsigned char pg_toupper(unsigned char ch)
Definition: pgstrcasecmp.c:105
unsigned char pg_tolower(unsigned char ch)
Definition: pgstrcasecmp.c:122
unsigned char pg_ascii_tolower(unsigned char ch)
Definition: pgstrcasecmp.c:146
unsigned char pg_ascii_toupper(unsigned char ch)
Definition: pgstrcasecmp.c:135
int pg_strncasecmp(const char *s1, const char *s2, size_t n)
Definition: pgstrcasecmp.c:69
static Datum ObjectIdGetDatum(Oid X)
Definition: postgres.h:262
uint64_t Datum
Definition: postgres.h:70
unsigned int Oid
Definition: postgres_ext.h:32
char * psprintf(const char *fmt,...)
Definition: psprintf.c:43
int(* strncoll)(const char *arg1, ssize_t len1, const char *arg2, ssize_t len2, pg_locale_t locale)
Definition: pg_locale.h:61
size_t(* strlower)(char *dest, size_t destsize, const char *src, ssize_t srclen, pg_locale_t locale)
Definition: pg_locale.h:87
const struct ctype_methods * ctype
Definition: pg_locale.h:153
const struct collate_methods * collate
Definition: pg_locale.h:152
void ReleaseSysCache(HeapTuple tuple)
Definition: syscache.c:264
HeapTuple SearchSysCache1(int cacheId, Datum key1)
Definition: syscache.c:220
Datum SysCacheGetAttrNotNull(int cacheId, HeapTuple tup, AttrNumber attributeNumber)
Definition: syscache.c:625
#define locale_t
Definition: win32_port.h:432
#define toupper_l
Definition: win32_port.h:434
#define iswalnum_l
Definition: win32_port.h:442
#define isgraph_l
Definition: win32_port.h:447
#define towupper_l
Definition: win32_port.h:436
#define ispunct_l
Definition: win32_port.h:451
#define isalpha_l
Definition: win32_port.h:439
#define strcoll_l
Definition: win32_port.h:455
#define iswgraph_l
Definition: win32_port.h:448
#define strxfrm_l
Definition: win32_port.h:456
#define towlower_l
Definition: win32_port.h:435
#define iswspace_l
Definition: win32_port.h:454
#define isdigit_l
Definition: win32_port.h:437
#define wcscoll_l
Definition: win32_port.h:457
#define tolower_l
Definition: win32_port.h:433
#define iswupper_l
Definition: win32_port.h:444
#define iswalpha_l
Definition: win32_port.h:440
#define isprint_l
Definition: win32_port.h:449
#define iswprint_l
Definition: win32_port.h:450
#define isupper_l
Definition: win32_port.h:443
#define isalnum_l
Definition: win32_port.h:441
#define islower_l
Definition: win32_port.h:445
#define iswlower_l
Definition: win32_port.h:446
#define iswpunct_l
Definition: win32_port.h:452
#define isspace_l
Definition: win32_port.h:453
#define iswdigit_l
Definition: win32_port.h:438