PostgreSQL Source Code git master
mvdistinct.c
Go to the documentation of this file.
1/*-------------------------------------------------------------------------
2 *
3 * mvdistinct.c
4 * POSTGRES multivariate ndistinct coefficients
5 *
6 * Estimating number of groups in a combination of columns (e.g. for GROUP BY)
7 * is tricky, and the estimation error is often significant.
8
9 * The multivariate ndistinct coefficients address this by storing ndistinct
10 * estimates for combinations of the user-specified columns. So for example
11 * given a statistics object on three columns (a,b,c), this module estimates
12 * and stores n-distinct for (a,b), (a,c), (b,c) and (a,b,c). The per-column
13 * estimates are already available in pg_statistic.
14 *
15 *
16 * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
17 * Portions Copyright (c) 1994, Regents of the University of California
18 *
19 * IDENTIFICATION
20 * src/backend/statistics/mvdistinct.c
21 *
22 *-------------------------------------------------------------------------
23 */
24#include "postgres.h"
25
26#include <math.h>
27
31#include "utils/syscache.h"
32#include "utils/typcache.h"
33#include "varatt.h"
34
35static double ndistinct_for_combination(double totalrows, StatsBuildData *data,
36 int k, int *combination);
37static double estimate_ndistinct(double totalrows, int numrows, int d, int f1);
38static int n_choose_k(int n, int k);
39static int num_combinations(int n);
40
41/* size of the struct header fields (magic, type, nitems) */
42#define SizeOfHeader (3 * sizeof(uint32))
43
44/* size of a serialized ndistinct item (coefficient, natts, atts) */
45#define SizeOfItem(natts) \
46 (sizeof(double) + sizeof(int) + (natts) * sizeof(AttrNumber))
47
48/* minimal size of a ndistinct item (with two attributes) */
49#define MinSizeOfItem SizeOfItem(2)
50
51/* minimal size of mvndistinct, when all items are minimal */
52#define MinSizeOfItems(nitems) \
53 (SizeOfHeader + (nitems) * MinSizeOfItem)
54
55/* Combination generator API */
56
57/* internal state for generator of k-combinations of n elements */
59{
60 int k; /* size of the combination */
61 int n; /* total number of elements */
62 int current; /* index of the next combination to return */
63 int ncombinations; /* number of combinations (size of array) */
64 int *combinations; /* array of pre-built combinations */
66
67static CombinationGenerator *generator_init(int n, int k);
71
72
73/*
74 * statext_ndistinct_build
75 * Compute ndistinct coefficient for the combination of attributes.
76 *
77 * This computes the ndistinct estimate using the same estimator used
78 * in analyze.c and then computes the coefficient.
79 *
80 * To handle expressions easily, we treat them as system attributes with
81 * negative attnums, and offset everything by number of expressions to
82 * allow using Bitmapsets.
83 */
86{
87 MVNDistinct *result;
88 int k;
89 int itemcnt;
90 int numattrs = data->nattnums;
91 int numcombs = num_combinations(numattrs);
92
93 result = palloc(offsetof(MVNDistinct, items) +
94 numcombs * sizeof(MVNDistinctItem));
97 result->nitems = numcombs;
98
99 itemcnt = 0;
100 for (k = 2; k <= numattrs; k++)
101 {
102 int *combination;
104
105 /* generate combinations of K out of N elements */
106 generator = generator_init(numattrs, k);
107
108 while ((combination = generator_next(generator)))
109 {
110 MVNDistinctItem *item = &result->items[itemcnt];
111 int j;
112
113 item->attributes = palloc(sizeof(AttrNumber) * k);
114 item->nattributes = k;
115
116 /* translate the indexes to attnums */
117 for (j = 0; j < k; j++)
118 {
119 item->attributes[j] = data->attnums[combination[j]];
120
122 }
123
124 item->ndistinct =
125 ndistinct_for_combination(totalrows, data, k, combination);
126
127 itemcnt++;
128 Assert(itemcnt <= result->nitems);
129 }
130
132 }
133
134 /* must consume exactly the whole output array */
135 Assert(itemcnt == result->nitems);
136
137 return result;
138}
139
140/*
141 * statext_ndistinct_load
142 * Load the ndistinct value for the indicated pg_statistic_ext tuple
143 */
146{
147 MVNDistinct *result;
148 bool isnull;
149 Datum ndist;
150 HeapTuple htup;
151
152 htup = SearchSysCache2(STATEXTDATASTXOID,
153 ObjectIdGetDatum(mvoid), BoolGetDatum(inh));
154 if (!HeapTupleIsValid(htup))
155 elog(ERROR, "cache lookup failed for statistics object %u", mvoid);
156
157 ndist = SysCacheGetAttr(STATEXTDATASTXOID, htup,
158 Anum_pg_statistic_ext_data_stxdndistinct, &isnull);
159 if (isnull)
160 elog(ERROR,
161 "requested statistics kind \"%c\" is not yet built for statistics object %u",
162 STATS_EXT_NDISTINCT, mvoid);
163
165
166 ReleaseSysCache(htup);
167
168 return result;
169}
170
171/*
172 * statext_ndistinct_serialize
173 * serialize ndistinct to the on-disk bytea format
174 */
175bytea *
177{
178 int i;
179 bytea *output;
180 char *tmp;
181 Size len;
182
183 Assert(ndistinct->magic == STATS_NDISTINCT_MAGIC);
185
186 /*
187 * Base size is size of scalar fields in the struct, plus one base struct
188 * for each item, including number of items for each.
189 */
191
192 /* and also include space for the actual attribute numbers */
193 for (i = 0; i < ndistinct->nitems; i++)
194 {
195 int nmembers;
196
197 nmembers = ndistinct->items[i].nattributes;
198 Assert(nmembers >= 2);
199
200 len += SizeOfItem(nmembers);
201 }
202
203 output = (bytea *) palloc(len);
205
206 tmp = VARDATA(output);
207
208 /* Store the base struct values (magic, type, nitems) */
209 memcpy(tmp, &ndistinct->magic, sizeof(uint32));
210 tmp += sizeof(uint32);
211 memcpy(tmp, &ndistinct->type, sizeof(uint32));
212 tmp += sizeof(uint32);
213 memcpy(tmp, &ndistinct->nitems, sizeof(uint32));
214 tmp += sizeof(uint32);
215
216 /*
217 * store number of attributes and attribute numbers for each entry
218 */
219 for (i = 0; i < ndistinct->nitems; i++)
220 {
221 MVNDistinctItem item = ndistinct->items[i];
222 int nmembers = item.nattributes;
223
224 memcpy(tmp, &item.ndistinct, sizeof(double));
225 tmp += sizeof(double);
226 memcpy(tmp, &nmembers, sizeof(int));
227 tmp += sizeof(int);
228
229 memcpy(tmp, item.attributes, sizeof(AttrNumber) * nmembers);
230 tmp += nmembers * sizeof(AttrNumber);
231
232 /* protect against overflows */
233 Assert(tmp <= ((char *) output + len));
234 }
235
236 /* check we used exactly the expected space */
237 Assert(tmp == ((char *) output + len));
238
239 return output;
240}
241
242/*
243 * statext_ndistinct_deserialize
244 * Read an on-disk bytea format MVNDistinct to in-memory format
245 */
248{
249 int i;
250 Size minimum_size;
251 MVNDistinct ndist;
252 MVNDistinct *ndistinct;
253 char *tmp;
254
255 if (data == NULL)
256 return NULL;
257
258 /* we expect at least the basic fields of MVNDistinct struct */
260 elog(ERROR, "invalid MVNDistinct size %zu (expected at least %zu)",
262
263 /* initialize pointer to the data part (skip the varlena header) */
264 tmp = VARDATA_ANY(data);
265
266 /* read the header fields and perform basic sanity checks */
267 memcpy(&ndist.magic, tmp, sizeof(uint32));
268 tmp += sizeof(uint32);
269 memcpy(&ndist.type, tmp, sizeof(uint32));
270 tmp += sizeof(uint32);
271 memcpy(&ndist.nitems, tmp, sizeof(uint32));
272 tmp += sizeof(uint32);
273
274 if (ndist.magic != STATS_NDISTINCT_MAGIC)
275 elog(ERROR, "invalid ndistinct magic %08x (expected %08x)",
277 if (ndist.type != STATS_NDISTINCT_TYPE_BASIC)
278 elog(ERROR, "invalid ndistinct type %d (expected %d)",
280 if (ndist.nitems == 0)
281 elog(ERROR, "invalid zero-length item array in MVNDistinct");
282
283 /* what minimum bytea size do we expect for those parameters */
284 minimum_size = MinSizeOfItems(ndist.nitems);
285 if (VARSIZE_ANY_EXHDR(data) < minimum_size)
286 elog(ERROR, "invalid MVNDistinct size %zu (expected at least %zu)",
287 VARSIZE_ANY_EXHDR(data), minimum_size);
288
289 /*
290 * Allocate space for the ndistinct items (no space for each item's
291 * attnos: those live in bitmapsets allocated separately)
292 */
293 ndistinct = palloc0(MAXALIGN(offsetof(MVNDistinct, items)) +
294 (ndist.nitems * sizeof(MVNDistinctItem)));
295 ndistinct->magic = ndist.magic;
296 ndistinct->type = ndist.type;
297 ndistinct->nitems = ndist.nitems;
298
299 for (i = 0; i < ndistinct->nitems; i++)
300 {
301 MVNDistinctItem *item = &ndistinct->items[i];
302
303 /* ndistinct value */
304 memcpy(&item->ndistinct, tmp, sizeof(double));
305 tmp += sizeof(double);
306
307 /* number of attributes */
308 memcpy(&item->nattributes, tmp, sizeof(int));
309 tmp += sizeof(int);
310 Assert((item->nattributes >= 2) && (item->nattributes <= STATS_MAX_DIMENSIONS));
311
312 item->attributes
313 = (AttrNumber *) palloc(item->nattributes * sizeof(AttrNumber));
314
315 memcpy(item->attributes, tmp, sizeof(AttrNumber) * item->nattributes);
316 tmp += sizeof(AttrNumber) * item->nattributes;
317
318 /* still within the bytea */
319 Assert(tmp <= ((char *) data + VARSIZE_ANY(data)));
320 }
321
322 /* we should have consumed the whole bytea exactly */
323 Assert(tmp == ((char *) data + VARSIZE_ANY(data)));
324
325 return ndistinct;
326}
327
328/*
329 * ndistinct_for_combination
330 * Estimates number of distinct values in a combination of columns.
331 *
332 * This uses the same ndistinct estimator as compute_scalar_stats() in
333 * ANALYZE, i.e.,
334 * n*d / (n - f1 + f1*n/N)
335 *
336 * except that instead of values in a single column we are dealing with
337 * combination of multiple columns.
338 */
339static double
341 int k, int *combination)
342{
343 int i,
344 j;
345 int f1,
346 cnt,
347 d;
348 bool *isnull;
349 Datum *values;
352 int numrows = data->numrows;
353
354 mss = multi_sort_init(k);
355
356 /*
357 * In order to determine the number of distinct elements, create separate
358 * values[]/isnull[] arrays with all the data we have, then sort them
359 * using the specified column combination as dimensions. We could try to
360 * sort in place, but it'd probably be more complex and bug-prone.
361 */
362 items = (SortItem *) palloc(numrows * sizeof(SortItem));
363 values = (Datum *) palloc0(sizeof(Datum) * numrows * k);
364 isnull = (bool *) palloc0(sizeof(bool) * numrows * k);
365
366 for (i = 0; i < numrows; i++)
367 {
368 items[i].values = &values[i * k];
369 items[i].isnull = &isnull[i * k];
370 }
371
372 /*
373 * For each dimension, set up sort-support and fill in the values from the
374 * sample data.
375 *
376 * We use the column data types' default sort operators and collations;
377 * perhaps at some point it'd be worth using column-specific collations?
378 */
379 for (i = 0; i < k; i++)
380 {
381 Oid typid;
384 VacAttrStats *colstat = data->stats[combination[i]];
385
386 typid = colstat->attrtypid;
387 collid = colstat->attrcollid;
388
390 if (type->lt_opr == InvalidOid) /* shouldn't happen */
391 elog(ERROR, "cache lookup failed for ordering operator for type %u",
392 typid);
393
394 /* prepare the sort function for this dimension */
395 multi_sort_add_dimension(mss, i, type->lt_opr, collid);
396
397 /* accumulate all the data for this dimension into the arrays */
398 for (j = 0; j < numrows; j++)
399 {
400 items[j].values[i] = data->values[combination[i]][j];
401 items[j].isnull[i] = data->nulls[combination[i]][j];
402 }
403 }
404
405 /* We can sort the array now ... */
406 qsort_interruptible(items, numrows, sizeof(SortItem),
407 multi_sort_compare, mss);
408
409 /* ... and count the number of distinct combinations */
410
411 f1 = 0;
412 cnt = 1;
413 d = 1;
414 for (i = 1; i < numrows; i++)
415 {
416 if (multi_sort_compare(&items[i], &items[i - 1], mss) != 0)
417 {
418 if (cnt == 1)
419 f1 += 1;
420
421 d++;
422 cnt = 0;
423 }
424
425 cnt += 1;
426 }
427
428 if (cnt == 1)
429 f1 += 1;
430
431 return estimate_ndistinct(totalrows, numrows, d, f1);
432}
433
434/* The Duj1 estimator (already used in analyze.c). */
435static double
436estimate_ndistinct(double totalrows, int numrows, int d, int f1)
437{
438 double numer,
439 denom,
440 ndistinct;
441
442 numer = (double) numrows * (double) d;
443
444 denom = (double) (numrows - f1) +
445 (double) f1 * (double) numrows / totalrows;
446
447 ndistinct = numer / denom;
448
449 /* Clamp to sane range in case of roundoff error */
450 if (ndistinct < (double) d)
451 ndistinct = (double) d;
452
453 if (ndistinct > totalrows)
454 ndistinct = totalrows;
455
456 return floor(ndistinct + 0.5);
457}
458
459/*
460 * n_choose_k
461 * computes binomial coefficients using an algorithm that is both
462 * efficient and prevents overflows
463 */
464static int
465n_choose_k(int n, int k)
466{
467 int d,
468 r;
469
470 Assert((k > 0) && (n >= k));
471
472 /* use symmetry of the binomial coefficients */
473 k = Min(k, n - k);
474
475 r = 1;
476 for (d = 1; d <= k; ++d)
477 {
478 r *= n--;
479 r /= d;
480 }
481
482 return r;
483}
484
485/*
486 * num_combinations
487 * number of combinations, excluding single-value combinations
488 */
489static int
491{
492 return (1 << n) - (n + 1);
493}
494
495/*
496 * generator_init
497 * initialize the generator of combinations
498 *
499 * The generator produces combinations of K elements in the interval (0..N).
500 * We prebuild all the combinations in this method, which is simpler than
501 * generating them on the fly.
502 */
504generator_init(int n, int k)
505{
507
508 Assert((n >= k) && (k > 0));
509
510 /* allocate the generator state as a single chunk of memory */
512
513 state->ncombinations = n_choose_k(n, k);
514
515 /* pre-allocate space for all combinations */
516 state->combinations = (int *) palloc(sizeof(int) * k * state->ncombinations);
517
518 state->current = 0;
519 state->k = k;
520 state->n = n;
521
522 /* now actually pre-generate all the combinations of K elements */
524
525 /* make sure we got the expected number of combinations */
526 Assert(state->current == state->ncombinations);
527
528 /* reset the number, so we start with the first one */
529 state->current = 0;
530
531 return state;
532}
533
534/*
535 * generator_next
536 * returns the next combination from the prebuilt list
537 *
538 * Returns a combination of K array indexes (0 .. N), as specified to
539 * generator_init), or NULL when there are no more combination.
540 */
541static int *
543{
544 if (state->current == state->ncombinations)
545 return NULL;
546
547 return &state->combinations[state->k * state->current++];
548}
549
550/*
551 * generator_free
552 * free the internal state of the generator
553 *
554 * Releases the generator internal state (pre-built combinations).
555 */
556static void
558{
559 pfree(state->combinations);
560 pfree(state);
561}
562
563/*
564 * generate_combinations_recurse
565 * given a prefix, generate all possible combinations
566 *
567 * Given a prefix (first few elements of the combination), generate following
568 * elements recursively. We generate the combinations in lexicographic order,
569 * which eliminates permutations of the same combination.
570 */
571static void
573 int index, int start, int *current)
574{
575 /* If we haven't filled all the elements, simply recurse. */
576 if (index < state->k)
577 {
578 int i;
579
580 /*
581 * The values have to be in ascending order, so make sure we start
582 * with the value passed by parameter.
583 */
584
585 for (i = start; i < state->n; i++)
586 {
587 current[index] = i;
588 generate_combinations_recurse(state, (index + 1), (i + 1), current);
589 }
590
591 return;
592 }
593 else
594 {
595 /* we got a valid combination, add it to the array */
596 memcpy(&state->combinations[(state->k * state->current)],
597 current, state->k * sizeof(int));
598 state->current++;
599 }
600}
601
602/*
603 * generate_combinations
604 * generate all k-combinations of N elements
605 */
606static void
608{
609 int *current = (int *) palloc0(sizeof(int) * state->k);
610
611 generate_combinations_recurse(state, 0, 0, current);
612
613 pfree(current);
614}
int16 AttrNumber
Definition: attnum.h:21
#define AttributeNumberIsValid(attributeNumber)
Definition: attnum.h:34
static Datum values[MAXATTR]
Definition: bootstrap.c:153
#define Min(x, y)
Definition: c.h:1008
#define MAXALIGN(LEN)
Definition: c.h:815
#define VARHDRSZ
Definition: c.h:702
uint32_t uint32
Definition: c.h:543
size_t Size
Definition: c.h:615
Oid collid
#define ERROR
Definition: elog.h:39
#define elog(elevel,...)
Definition: elog.h:226
int multi_sort_compare(const void *a, const void *b, void *arg)
MultiSortSupport multi_sort_init(int ndims)
void multi_sort_add_dimension(MultiSortSupport mss, int sortdim, Oid oper, Oid collation)
#define DatumGetByteaPP(X)
Definition: fmgr.h:291
Assert(PointerIsAligned(start, uint64))
return str start
#define HeapTupleIsValid(tuple)
Definition: htup.h:78
#define nitems(x)
Definition: indent.h:31
FILE * output
int j
Definition: isn.c:78
int i
Definition: isn.c:77
void pfree(void *pointer)
Definition: mcxt.c:1594
void * palloc0(Size size)
Definition: mcxt.c:1395
void * palloc(Size size)
Definition: mcxt.c:1365
static int n_choose_k(int n, int k)
Definition: mvdistinct.c:465
#define SizeOfHeader
Definition: mvdistinct.c:42
struct CombinationGenerator CombinationGenerator
static double estimate_ndistinct(double totalrows, int numrows, int d, int f1)
Definition: mvdistinct.c:436
static void generate_combinations_recurse(CombinationGenerator *state, int index, int start, int *current)
Definition: mvdistinct.c:572
MVNDistinct * statext_ndistinct_deserialize(bytea *data)
Definition: mvdistinct.c:247
static double ndistinct_for_combination(double totalrows, StatsBuildData *data, int k, int *combination)
Definition: mvdistinct.c:340
bytea * statext_ndistinct_serialize(MVNDistinct *ndistinct)
Definition: mvdistinct.c:176
static void generate_combinations(CombinationGenerator *state)
Definition: mvdistinct.c:607
MVNDistinct * statext_ndistinct_load(Oid mvoid, bool inh)
Definition: mvdistinct.c:145
static int num_combinations(int n)
Definition: mvdistinct.c:490
MVNDistinct * statext_ndistinct_build(double totalrows, StatsBuildData *data)
Definition: mvdistinct.c:85
#define SizeOfItem(natts)
Definition: mvdistinct.c:45
static void generator_free(CombinationGenerator *state)
Definition: mvdistinct.c:557
static CombinationGenerator * generator_init(int n, int k)
Definition: mvdistinct.c:504
#define MinSizeOfItems(nitems)
Definition: mvdistinct.c:52
static int * generator_next(CombinationGenerator *state)
Definition: mvdistinct.c:542
const void size_t len
const void * data
void qsort_interruptible(void *base, size_t nel, size_t elsize, qsort_arg_comparator cmp, void *arg)
static Datum BoolGetDatum(bool X)
Definition: postgres.h:112
static Datum ObjectIdGetDatum(Oid X)
Definition: postgres.h:262
uint64_t Datum
Definition: postgres.h:70
#define InvalidOid
Definition: postgres_ext.h:37
unsigned int Oid
Definition: postgres_ext.h:32
int f1[ARRAY_SIZE]
Definition: sql-declare.c:113
#define STATS_NDISTINCT_MAGIC
Definition: statistics.h:22
#define STATS_NDISTINCT_TYPE_BASIC
Definition: statistics.h:23
#define STATS_MAX_DIMENSIONS
Definition: statistics.h:19
double ndistinct
Definition: statistics.h:28
AttrNumber * attributes
Definition: statistics.h:30
uint32 nitems
Definition: statistics.h:38
uint32 type
Definition: statistics.h:37
uint32 magic
Definition: statistics.h:36
MVNDistinctItem items[FLEXIBLE_ARRAY_MEMBER]
Definition: statistics.h:39
Oid attrtypid
Definition: vacuum.h:126
Oid attrcollid
Definition: vacuum.h:129
Definition: type.h:96
Definition: regguts.h:323
Definition: c.h:697
void ReleaseSysCache(HeapTuple tuple)
Definition: syscache.c:264
Datum SysCacheGetAttr(int cacheId, HeapTuple tup, AttrNumber attributeNumber, bool *isNull)
Definition: syscache.c:595
HeapTuple SearchSysCache2(int cacheId, Datum key1, Datum key2)
Definition: syscache.c:230
static ItemArray items
Definition: test_tidstore.c:48
TypeCacheEntry * lookup_type_cache(Oid type_id, int flags)
Definition: typcache.c:386
#define TYPECACHE_LT_OPR
Definition: typcache.h:139
static Size VARSIZE_ANY(const void *PTR)
Definition: varatt.h:460
static Size VARSIZE_ANY_EXHDR(const void *PTR)
Definition: varatt.h:472
static char * VARDATA(const void *PTR)
Definition: varatt.h:305
static char * VARDATA_ANY(const void *PTR)
Definition: varatt.h:486
static void SET_VARSIZE(void *PTR, Size len)
Definition: varatt.h:432
const char * type