#include "postgres.h"
#include <math.h>
#include <limits.h>
#include "access/htup_details.h"
#include "access/parallel.h"
#include "catalog/pg_statistic.h"
#include "commands/tablespace.h"
#include "executor/executor.h"
#include "executor/hashjoin.h"
#include "executor/nodeHash.h"
#include "executor/nodeHashjoin.h"
#include "miscadmin.h"
#include "port/pg_bitutils.h"
#include "utils/lsyscache.h"
#include "utils/memutils.h"
#include "utils/syscache.h"
#include "utils/wait_event.h"

Include dependency graph for nodeHash.c:

Macros
#define	NTUP_PER_BUCKET 1

Functions
static void	ExecHashIncreaseNumBatches (HashJoinTable hashtable)

static void	ExecHashIncreaseNumBuckets (HashJoinTable hashtable)

static void	ExecParallelHashIncreaseNumBatches (HashJoinTable hashtable)

static void	ExecParallelHashIncreaseNumBuckets (HashJoinTable hashtable)

static void	ExecHashBuildSkewHash (HashState *hashstate, HashJoinTable hashtable, Hash *node, int mcvsToUse)

static void	ExecHashSkewTableInsert (HashJoinTable hashtable, TupleTableSlot *slot, uint32 hashvalue, int bucketNumber)

static void	ExecHashRemoveNextSkewBucket (HashJoinTable hashtable)

static void *	dense_alloc (HashJoinTable hashtable, Size size)

static HashJoinTuple	ExecParallelHashTupleAlloc (HashJoinTable hashtable, size_t size, dsa_pointer *shared)

static void	MultiExecPrivateHash (HashState *node)

static void	MultiExecParallelHash (HashState *node)

static HashJoinTuple	ExecParallelHashFirstTuple (HashJoinTable hashtable, int bucketno)

static HashJoinTuple	ExecParallelHashNextTuple (HashJoinTable hashtable, HashJoinTuple tuple)

static void	ExecParallelHashPushTuple (dsa_pointer_atomic *head, HashJoinTuple tuple, dsa_pointer tuple_shared)

static void	ExecParallelHashJoinSetUpBatches (HashJoinTable hashtable, int nbatch)

static void	ExecParallelHashEnsureBatchAccessors (HashJoinTable hashtable)

static void	ExecParallelHashRepartitionFirst (HashJoinTable hashtable)

static void	ExecParallelHashRepartitionRest (HashJoinTable hashtable)

static HashMemoryChunk	ExecParallelHashPopChunkQueue (HashJoinTable hashtable, dsa_pointer *shared)

static bool	ExecParallelHashTuplePrealloc (HashJoinTable hashtable, int batchno, size_t size)

static void	ExecParallelHashMergeCounters (HashJoinTable hashtable)

static void	ExecParallelHashCloseBatchAccessors (HashJoinTable hashtable)

static TupleTableSlot *	ExecHash (PlanState *pstate)

Node *	MultiExecHash (HashState *node)

HashState *	ExecInitHash (Hash *node, EState *estate, int eflags)

void	ExecEndHash (HashState *node)

HashJoinTable	ExecHashTableCreate (HashState *state)

void	ExecChooseHashTableSize (double ntuples, int tupwidth, bool useskew, bool try_combined_hash_mem, int parallel_workers, size_t *space_allowed, int *numbuckets, int *numbatches, int *num_skew_mcvs)

void	ExecHashTableDestroy (HashJoinTable hashtable)

static bool	ExecHashIncreaseBatchSize (HashJoinTable hashtable)

void	ExecHashTableInsert (HashJoinTable hashtable, TupleTableSlot *slot, uint32 hashvalue)

void	ExecParallelHashTableInsert (HashJoinTable hashtable, TupleTableSlot *slot, uint32 hashvalue)

void	ExecParallelHashTableInsertCurrentBatch (HashJoinTable hashtable, TupleTableSlot *slot, uint32 hashvalue)

void	ExecHashGetBucketAndBatch (HashJoinTable hashtable, uint32 hashvalue, int *bucketno, int *batchno)

bool	ExecScanHashBucket (HashJoinState *hjstate, ExprContext *econtext)

bool	ExecParallelScanHashBucket (HashJoinState *hjstate, ExprContext *econtext)

void	ExecPrepHashTableForUnmatched (HashJoinState *hjstate)

bool	ExecParallelPrepHashTableForUnmatched (HashJoinState *hjstate)

bool	ExecScanHashTableForUnmatched (HashJoinState *hjstate, ExprContext *econtext)

bool	ExecParallelScanHashTableForUnmatched (HashJoinState *hjstate, ExprContext *econtext)

void	ExecHashTableReset (HashJoinTable hashtable)

void	ExecHashTableResetMatchFlags (HashJoinTable hashtable)

void	ExecReScanHash (HashState *node)

int	ExecHashGetSkewBucket (HashJoinTable hashtable, uint32 hashvalue)

void	ExecHashEstimate (HashState *node, ParallelContext *pcxt)

void	ExecHashInitializeDSM (HashState *node, ParallelContext *pcxt)

void	ExecHashInitializeWorker (HashState *node, ParallelWorkerContext *pwcxt)

void	ExecShutdownHash (HashState *node)

void	ExecHashRetrieveInstrumentation (HashState *node)

void	ExecHashAccumInstrumentation (HashInstrumentation *instrument, HashJoinTable hashtable)

void	ExecParallelHashTableAlloc (HashJoinTable hashtable, int batchno)

void	ExecHashTableDetachBatch (HashJoinTable hashtable)

void	ExecHashTableDetach (HashJoinTable hashtable)

void	ExecParallelHashTableSetCurrentBatch (HashJoinTable hashtable, int batchno)

size_t	get_hash_memory_limit (void)

Macro Definition Documentation

◆ NTUP_PER_BUCKET

#define NTUP_PER_BUCKET 1

Definition at line 654 of file nodeHash.c.

Function Documentation

◆ dense_alloc()

static void * dense_alloc	(	HashJoinTable	hashtable,
		Size	size
	)

static

Definition at line 2895 of file nodeHash.c.

{
    HashMemoryChunk head;
    HashMemoryChunk fresh;
    char       *tupleMem;

    /* Round the request up in case the caller did not align it already. */
    size = MAXALIGN(size);

    if (size > HASH_CHUNK_THRESHOLD)
    {
        /*
         * Oversized request: it gets a dedicated chunk sized exactly for it,
         * which is therefore completely full from the start.
         */
        fresh = (HashMemoryChunk) MemoryContextAlloc(hashtable->batchCxt,
                                                     HASH_CHUNK_HEADER_SIZE + size);
        fresh->maxlen = size;
        fresh->used = size;
        fresh->ntuples = 1;

        head = hashtable->chunks;
        if (head == NULL)
        {
            /* Empty list: the dedicated chunk simply becomes the head. */
            fresh->next.unshared = head;
            hashtable->chunks = fresh;
        }
        else
        {
            /*
             * Link it in as the second element, so the current head chunk
             * (which may still have free space) stays the "current" chunk.
             */
            fresh->next = head->next;
            head->next.unshared = fresh;
        }

        return HASH_CHUNK_DATA(fresh);
    }

    /* Fast path: the current chunk exists and still has enough room. */
    head = hashtable->chunks;
    if (head != NULL && (head->maxlen - head->used) >= size)
    {
        tupleMem = HASH_CHUNK_DATA(head) + head->used;
        head->used += size;
        head->ntuples += 1;

        /* hand back the start of the space reserved for this tuple */
        return tupleMem;
    }

    /*
     * No chunk yet, or not enough room left in the current one: start a new
     * standard-sized chunk, push it onto the front of the list, and place the
     * tuple at its beginning.
     */
    fresh = (HashMemoryChunk) MemoryContextAlloc(hashtable->batchCxt,
                                                 HASH_CHUNK_HEADER_SIZE + HASH_CHUNK_SIZE);
    fresh->maxlen = HASH_CHUNK_SIZE;
    fresh->used = size;
    fresh->ntuples = 1;

    fresh->next.unshared = hashtable->chunks;
    hashtable->chunks = fresh;

    return HASH_CHUNK_DATA(fresh);
}

References HashJoinTableData::batchCxt, HashJoinTableData::chunks, HASH_CHUNK_DATA, HASH_CHUNK_HEADER_SIZE, HASH_CHUNK_SIZE, HASH_CHUNK_THRESHOLD, MAXALIGN, HashMemoryChunkData::maxlen, MemoryContextAlloc(), HashMemoryChunkData::next, HashMemoryChunkData::ntuples, HashMemoryChunkData::unshared, and HashMemoryChunkData::used.

Referenced by ExecHashIncreaseNumBatches(), ExecHashRemoveNextSkewBucket(), and ExecHashTableInsert().

◆ ExecChooseHashTableSize()

void ExecChooseHashTableSize	(	double	ntuples,
		int	tupwidth,
		bool	useskew,
		bool	try_combined_hash_mem,
		int	parallel_workers,
		size_t *	space_allowed,
		int *	numbuckets,
		int *	numbatches,
		int *	num_skew_mcvs
	)

Definition at line 657 of file nodeHash.c.

{
    /*
     * Choose the initial sizing of a hash join's hash table.
     *
     * Inputs: ntuples and tupwidth describe the estimated inner relation;
     * useskew says whether room should be set aside for skew buckets;
     * try_combined_hash_mem asks for the memory limit to be multiplied by
     * (parallel_workers + 1) before sizing.
     *
     * Outputs (all written through the pointer arguments): *space_allowed is
     * the memory budget in bytes (possibly doubled one or more times by the
     * final loop), *numbuckets and *numbatches are the chosen bucket and
     * batch counts, and *num_skew_mcvs is the number of MCVs allowed in the
     * skew table (0 when useskew is false).
     */
    int         tupsize;
    double      inner_rel_bytes;
    size_t      hash_table_bytes;
    size_t      bucket_bytes;
    size_t      max_pointers;
    int         nbatch = 1;
    int         nbuckets;
    double      dbuckets;
 
    /* Force a plausible relation size if no info */
    if (ntuples <= 0.0)
        ntuples = 1000.0;
 
    /*
     * Estimate tupsize based on footprint of tuple in hashtable... note this
     * does not allow for any palloc overhead.  The manipulations of spaceUsed
     * don't count palloc overhead either.
     */
    tupsize = HJTUPLE_OVERHEAD +
        MAXALIGN(SizeofMinimalTupleHeader) +
        MAXALIGN(tupwidth);
    inner_rel_bytes = ntuples * tupsize;
 
    /*
     * Compute in-memory hashtable size limit from GUCs.
     */
    hash_table_bytes = get_hash_memory_limit();
 
    /*
     * Parallel Hash tries to use the combined hash_mem of all workers to
     * avoid the need to batch.  If that won't work, it falls back to hash_mem
     * per worker and tries to process batches in parallel.
     */
    if (try_combined_hash_mem)
    {
        /* Careful, this could overflow size_t */
        double      newlimit;
 
        newlimit = (double) hash_table_bytes * (double) (parallel_workers + 1);
        newlimit = Min(newlimit, (double) SIZE_MAX);
        hash_table_bytes = (size_t) newlimit;
    }
 
    *space_allowed = hash_table_bytes;
 
    /*
     * If skew optimization is possible, estimate the number of skew buckets
     * that will fit in the memory allowed, and decrement the assumed space
     * available for the main hash table accordingly.
     *
     * We make the optimistic assumption that each skew bucket will contain
     * one inner-relation tuple.  If that turns out to be low, we will recover
     * at runtime by reducing the number of skew buckets.
     *
     * hashtable->skewBucket will have up to 8 times as many HashSkewBucket
     * pointers as the number of MCVs we allow, since ExecHashBuildSkewHash
     * will round up to the next power of 2 and then multiply by 4 to reduce
     * collisions.
     */
    if (useskew)
    {
        size_t      bytes_per_mcv;
        size_t      skew_mcvs;
 
        /*----------
         * Compute number of MCVs we could hold in hash_table_bytes
         *
         * Divisor is:
         * size of a hash tuple +
         * worst-case size of skewBucket[] per MCV +
         * size of skewBucketNums[] entry +
         * size of skew bucket struct itself
         *----------
         */
        bytes_per_mcv = tupsize +
            (8 * sizeof(HashSkewBucket *)) +
            sizeof(int) +
            SKEW_BUCKET_OVERHEAD;
        skew_mcvs = hash_table_bytes / bytes_per_mcv;
 
        /*
         * Now scale by SKEW_HASH_MEM_PERCENT (we do it in this order so as
         * not to worry about size_t overflow in the multiplication)
         */
        skew_mcvs = (skew_mcvs * SKEW_HASH_MEM_PERCENT) / 100;
 
        /* Now clamp to integer range */
        skew_mcvs = Min(skew_mcvs, INT_MAX);
 
        *num_skew_mcvs = (int) skew_mcvs;
 
        /* Reduce hash_table_bytes by the amount needed for the skew table */
        if (skew_mcvs > 0)
            hash_table_bytes -= skew_mcvs * bytes_per_mcv;
    }
    else
        *num_skew_mcvs = 0;
 
    /*
     * Set nbuckets to achieve an average bucket load of NTUP_PER_BUCKET when
     * memory is filled, assuming a single batch; but limit the value so that
     * the pointer arrays we'll try to allocate do not exceed hash_table_bytes
     * nor MaxAllocSize.
     *
     * Note that both nbuckets and nbatch must be powers of 2 to make
     * ExecHashGetBucketAndBatch fast.
     */
    max_pointers = hash_table_bytes / sizeof(HashJoinTuple);
    max_pointers = Min(max_pointers, MaxAllocSize / sizeof(HashJoinTuple));
    /* If max_pointers isn't a power of 2, must round it down to one */
    max_pointers = pg_prevpower2_size_t(max_pointers);
 
    /* Also ensure we avoid integer overflow in nbatch and nbuckets */
    /* (this step is redundant given the current value of MaxAllocSize) */
    max_pointers = Min(max_pointers, INT_MAX / 2 + 1);
 
    dbuckets = ceil(ntuples / NTUP_PER_BUCKET);
    dbuckets = Min(dbuckets, max_pointers);
    nbuckets = (int) dbuckets;
    /* don't let nbuckets be really small, though ... */
    nbuckets = Max(nbuckets, 1024);
    /* ... and force it to be a power of 2. */
    nbuckets = pg_nextpower2_32(nbuckets);
 
    /*
     * If there's not enough space to store the projected number of tuples and
     * the required bucket headers, we will need multiple batches.
     */
    bucket_bytes = sizeof(HashJoinTuple) * nbuckets;
    if (inner_rel_bytes + bucket_bytes > hash_table_bytes)
    {
        /* We'll need multiple batches */
        size_t      sbuckets;
        double      dbatch;
        int         minbatch;
        size_t      bucket_size;
 
        /*
         * If Parallel Hash with combined hash_mem would still need multiple
         * batches, we'll have to fall back to regular hash_mem budget.
         *
         * The recursive call recomputes everything with
         * try_combined_hash_mem = false and overwrites all four outputs, so
         * nothing computed above survives on this path.
         */
        if (try_combined_hash_mem)
        {
            ExecChooseHashTableSize(ntuples, tupwidth, useskew,
                                    false, parallel_workers,
                                    space_allowed,
                                    numbuckets,
                                    numbatches,
                                    num_skew_mcvs);
            return;
        }
 
        /*
         * Estimate the number of buckets we'll want to have when hash_mem is
         * entirely full.  Each bucket will contain a bucket pointer plus
         * NTUP_PER_BUCKET tuples, whose projected size already includes
         * overhead for the hash code, pointer to the next tuple, etc.
         */
        bucket_size = (tupsize * NTUP_PER_BUCKET + sizeof(HashJoinTuple));
        if (hash_table_bytes <= bucket_size)
            sbuckets = 1;       /* avoid pg_nextpower2_size_t(0) */
        else
            sbuckets = pg_nextpower2_size_t(hash_table_bytes / bucket_size);
        sbuckets = Min(sbuckets, max_pointers);
        nbuckets = (int) sbuckets;
        nbuckets = pg_nextpower2_32(nbuckets);
        bucket_bytes = nbuckets * sizeof(HashJoinTuple);
 
        /*
         * Buckets are simple pointers to hashjoin tuples, while tupsize
         * includes the pointer, hash code, and MinimalTupleData.  So buckets
         * should never really exceed 25% of hash_mem (even for
         * NTUP_PER_BUCKET=1); except maybe for hash_mem values that are not
         * 2^N bytes, where we might get more because of doubling. So let's
         * look for 50% here.
         */
        Assert(bucket_bytes <= hash_table_bytes / 2);
 
        /* Calculate required number of batches. */
        dbatch = ceil(inner_rel_bytes / (hash_table_bytes - bucket_bytes));
        dbatch = Min(dbatch, max_pointers);
        minbatch = (int) dbatch;
        nbatch = pg_nextpower2_32(Max(2, minbatch));
    }
 
    /*
     * Optimize the total amount of memory consumed by the hash node.
     *
     * The nbatch calculation above focuses on the in-memory hash table,
     * assuming no per-batch overhead. But each batch may have two files, each
     * with a BLCKSZ buffer. For large nbatch values these buffers may use
     * significantly more memory than the hash table.
     *
     * The total memory usage may be expressed by this formula:
     *
     * (inner_rel_bytes / nbatch) + (2 * nbatch * BLCKSZ)
     *
     * where (inner_rel_bytes / nbatch) is the size of the in-memory hash
     * table and (2 * nbatch * BLCKSZ) is the amount of memory used by file
     * buffers.
     *
     * The nbatch calculation however ignores the second part. And for very
     * large inner_rel_bytes, there may be no nbatch that keeps total memory
     * usage under the budget (work_mem * hash_mem_multiplier). To deal with
     * that, we will adjust nbatch to minimize total memory consumption across
     * both the hashtable and file buffers.
     *
     * As we increase the size of the hashtable, the number of batches
     * decreases, and the total memory usage follows a U-shaped curve. We find
     * the minimum nbatch by "walking back" -- checking if halving nbatch
     * would lower the total memory usage. We stop when it no longer helps.
     *
     * We only reduce the number of batches. Adding batches reduces memory
     * usage only when most of the memory is used by the hash table, with
     * total memory usage within the limit or not far from it. We don't want
     * to start batching when not needed, even if that would reduce memory
     * usage.
     *
     * While growing the hashtable, we also adjust the number of buckets to
     * maintain a load factor of NTUP_PER_BUCKET while squeezing tuples back
     * from batches into the hashtable.
     *
     * Note that we can only change nbuckets during initial hashtable sizing.
     * Once we start building the hash, nbuckets is fixed (we may still grow
     * the hash table).
     *
     * We double several parameters (space_allowed, nbuckets, num_skew_mcvs),
     * which introduces a risk of overflow. We avoid this by exiting the loop.
     * We could do something smarter (e.g. capping nbuckets and continue), but
     * the complexity is not worth it. Such cases are extremely rare, and this
     * is a best-effort attempt to reduce memory usage.
     */
    while (nbatch > 1)
    {
        /* Check that buckets won't overflow MaxAllocSize */
        if (nbuckets > (MaxAllocSize / sizeof(HashJoinTuple) / 2))
            break;
 
        /*
         * num_skew_mcvs should be less than nbuckets
         *
         * NOTE(review): unlike the nbuckets and space_allowed checks around
         * it, this guard on the doubling of *num_skew_mcvs below is only an
         * Assert; it does not break out of the loop.
         */
        Assert((*num_skew_mcvs) < (INT_MAX / 2));
 
        /*
         * Check that space_allowed won't overflow SIZE_MAX.
         *
         * We don't use hash_table_bytes here, because it does not include the
         * skew buckets. And we want to limit the overall memory limit.
         */
        if ((*space_allowed) > (SIZE_MAX / 2))
            break;
 
        /*
         * Will halving the number of batches and doubling the size of the
         * hashtable reduce overall memory usage?
         *
         * This is the same as (S = space_allowed):
         *
         * (S + 2 * nbatch * BLCKSZ) < (S * 2 + nbatch * BLCKSZ)
         *
         * but avoiding intermediate overflow.
         */
        if (nbatch < (*space_allowed) / BLCKSZ)
            break;
 
        /*
         * MaxAllocSize is sufficiently small that we are not worried about
         * overflowing nbuckets.
         */
        nbuckets *= 2;
 
        *num_skew_mcvs = (*num_skew_mcvs) * 2;
        *space_allowed = (*space_allowed) * 2;
 
        nbatch /= 2;
    }
 
    Assert(nbuckets > 0);
    Assert(nbatch > 0);
 
    /* Hand the final bucket and batch counts back to the caller. */
    *numbuckets = nbuckets;
    *numbatches = nbatch;
}

References Assert(), ExecChooseHashTableSize(), get_hash_memory_limit(), HJTUPLE_OVERHEAD, Max, MAXALIGN, MaxAllocSize, Min, NTUP_PER_BUCKET, pg_nextpower2_32(), pg_nextpower2_size_t, pg_prevpower2_size_t, SizeofMinimalTupleHeader, SKEW_BUCKET_OVERHEAD, and SKEW_HASH_MEM_PERCENT.

Referenced by ExecChooseHashTableSize(), ExecHashTableCreate(), and initial_cost_hashjoin().

◆ ExecEndHash()

void ExecEndHash ( HashState * node )

Definition at line 426 of file nodeHash.c.

{
    /*
     * The only cleanup performed here is ending the outer (child) plan
     * state of this node.
     */
    ExecEndNode(outerPlanState(node));
}

References ExecEndNode(), outerPlan, and outerPlanState.

Referenced by ExecEndNode().

◆ ExecHash()

static TupleTableSlot * ExecHash ( PlanState * pstate )

static

Definition at line 90 of file nodeHash.c.

{
    /*
     * Stub for the ExecProcNode call convention: the only thing it does is
     * report that this convention is unsupported for Hash nodes.
     */
    elog(ERROR, "Hash node does not support ExecProcNode call convention");

    /* presumably unreachable after the ERROR report; gives the function a return value */
    return NULL;
}

References elog, and ERROR.

Referenced by ExecInitHash().

◆ ExecHashAccumInstrumentation()

void ExecHashAccumInstrumentation	(	HashInstrumentation *	instrument,
		HashJoinTable	hashtable
	)

Definition at line 2876 of file nodeHash.c.

{
    /*
     * Fold this hash table's statistics into *instrument, keeping for every
     * counter the larger of the stored value and the table's value.
     */
    if (hashtable->nbuckets > instrument->nbuckets)
        instrument->nbuckets = hashtable->nbuckets;

    if (hashtable->nbuckets_original > instrument->nbuckets_original)
        instrument->nbuckets_original = hashtable->nbuckets_original;

    if (hashtable->nbatch > instrument->nbatch)
        instrument->nbatch = hashtable->nbatch;

    if (hashtable->nbatch_original > instrument->nbatch_original)
        instrument->nbatch_original = hashtable->nbatch_original;

    if (hashtable->spacePeak > instrument->space_peak)
        instrument->space_peak = hashtable->spacePeak;
}

References Max, HashJoinTableData::nbatch, HashInstrumentation::nbatch, HashJoinTableData::nbatch_original, HashInstrumentation::nbatch_original, HashJoinTableData::nbuckets, HashInstrumentation::nbuckets, HashJoinTableData::nbuckets_original, HashInstrumentation::nbuckets_original, HashInstrumentation::space_peak, and HashJoinTableData::spacePeak.

Referenced by ExecReScanHashJoin(), and ExecShutdownHash().

◆ ExecHashBuildSkewHash()

static void ExecHashBuildSkewHash	(	HashState *	hashstate,
		HashJoinTable	hashtable,
		Hash *	node,
		int	mcvsToUse
	)

static

Definition at line 2402 of file nodeHash.c.

{
    HeapTupleData *statsTuple;
    AttStatsSlot sslot;
    double      coveredFrac;
    int         tableLen;
    int         mask;
    int         idx;
    size_t      arrayBytes;

    /*
     * Bail out early if the planner did not identify the outer relation's
     * join key, or if there is no room for even a single skew bucket.
     */
    if (!OidIsValid(node->skewTable) || mcvsToUse <= 0)
        return;

    /* Look up the statistics row for the outer relation's join key. */
    statsTuple = SearchSysCache3(STATRELATTINH,
                                 ObjectIdGetDatum(node->skewTable),
                                 Int16GetDatum(node->skewColumn),
                                 BoolGetDatum(node->skewInherit));
    if (!HeapTupleIsValid(statsTuple))
        return;

    /* No MCV slot available: release the cache entry and give up. */
    if (!get_attstatsslot(&sslot, statsTuple,
                          STATISTIC_KIND_MCV, InvalidOid,
                          ATTSTATSSLOT_VALUES | ATTSTATSSLOT_NUMBERS))
    {
        ReleaseSysCache(statsTuple);
        return;
    }

    /* Never use more MCVs than the statistics actually provide. */
    mcvsToUse = Min(mcvsToUse, sslot.nvalues);

    /*
     * Add up the frequencies of the MCVs we would use.  That sum is the
     * expected fraction of the outer relation that benefits from the skew
     * optimization; the skew table is only built when the sum is not below
     * SKEW_MIN_OUTER_FRACTION.
     */
    coveredFrac = 0;
    for (idx = 0; idx < mcvsToUse; idx++)
        coveredFrac += sslot.numbers[idx];

    if (!(coveredFrac < SKEW_MIN_OUTER_FRACTION))
    {
        /*
         * Size the skew hashtable.
         *
         * skewBucket[] is an open-addressing table whose length is a power
         * of 2 strictly greater than the number of MCVs, so at least one
         * entry always stays NULL and every probe sequence terminates.  The
         * length is then multiplied by 4 (two extra bits) to make collisions
         * less likely.
         *
         * Note: this would misbehave if mcvsToUse exceeded INT_MAX/8 or
         * MaxAllocSize/sizeof(void *)/8, but pg_statistic entries are
         * limited to far fewer values than that.
         */
        tableLen = pg_nextpower2_32(mcvsToUse + 1);
        tableLen <<= 2;
        mask = tableLen - 1;

        hashtable->skewEnabled = true;
        hashtable->skewBucketLen = tableLen;

        /*
         * Both arrays live in the hashtable's batch context: they are only
         * needed during the first batch, and this way they are discarded
         * automatically once that batch is done.
         */
        hashtable->skewBucket = (HashSkewBucket **)
            MemoryContextAllocZero(hashtable->batchCxt,
                                   tableLen * sizeof(HashSkewBucket *));
        hashtable->skewBucketNums = (int *)
            MemoryContextAllocZero(hashtable->batchCxt,
                                   mcvsToUse * sizeof(int));

        /* Charge the two arrays to both space counters and track the peak. */
        arrayBytes = tableLen * sizeof(HashSkewBucket *)
            + mcvsToUse * sizeof(int);
        hashtable->spaceUsed += arrayBytes;
        hashtable->spaceUsedSkew += arrayBytes;
        if (hashtable->spaceUsed > hashtable->spacePeak)
            hashtable->spacePeak = hashtable->spaceUsed;

        /*
         * Create one skew bucket per distinct MCV hash value.
         *
         * The buckets must be created in order of decreasing MCV frequency:
         * when buckets have to be dropped they are removed in reverse order
         * of creation (see notes in ExecHashRemoveNextSkewBucket), and the
         * least common MCVs should be the first to go.
         */
        for (idx = 0; idx < mcvsToUse; idx++)
        {
            uint32      hv;
            int         slot;
            HashSkewBucket *entry;

            hv = DatumGetUInt32(FunctionCall1Coll(hashstate->skew_hashfunction,
                                                  hashstate->skew_collation,
                                                  sslot.values[idx]));

            /*
             * Linear probe: step forward while the slot is occupied by a
             * different hash value.  NB: this must stay in sync with the
             * probing done by ExecHashGetSkewBucket.
             */
            for (slot = hv & mask;
                 hashtable->skewBucket[slot] != NULL &&
                 hashtable->skewBucket[slot]->hashvalue != hv;
                 slot = (slot + 1) & mask)
                ;

            /*
             * An occupied slot here means another MCV already has this hash
             * value; sharing a bucket is fine, so nothing more to do.
             */
            if (hashtable->skewBucket[slot] != NULL)
                continue;

            /* Empty slot: set up a new, initially tuple-less skew bucket. */
            entry = (HashSkewBucket *)
                MemoryContextAlloc(hashtable->batchCxt,
                                   sizeof(HashSkewBucket));
            entry->hashvalue = hv;
            entry->tuples = NULL;
            hashtable->skewBucket[slot] = entry;

            /* Record creation order, then account for the bucket's space. */
            hashtable->skewBucketNums[hashtable->nSkewBuckets] = slot;
            hashtable->nSkewBuckets++;
            hashtable->spaceUsed += SKEW_BUCKET_OVERHEAD;
            hashtable->spaceUsedSkew += SKEW_BUCKET_OVERHEAD;
            if (hashtable->spaceUsed > hashtable->spacePeak)
                hashtable->spacePeak = hashtable->spaceUsed;
        }
    }

    /* Common cleanup for both the "built" and "not worthwhile" outcomes. */
    free_attstatsslot(&sslot);
    ReleaseSysCache(statsTuple);
}

References ATTSTATSSLOT_NUMBERS, ATTSTATSSLOT_VALUES, HashJoinTableData::batchCxt, BoolGetDatum(), DatumGetUInt32(), free_attstatsslot(), FunctionCall1Coll(), get_attstatsslot(), HashSkewBucket::hashvalue, HeapTupleIsValid, i, if(), Int16GetDatum(), InvalidOid, MemoryContextAlloc(), MemoryContextAllocZero(), HashJoinTableData::nSkewBuckets, AttStatsSlot::numbers, AttStatsSlot::nvalues, ObjectIdGetDatum(), OidIsValid, pg_nextpower2_32(), ReleaseSysCache(), SearchSysCache3(), SKEW_BUCKET_OVERHEAD, HashState::skew_collation, HashState::skew_hashfunction, SKEW_MIN_OUTER_FRACTION, HashJoinTableData::skewBucket, HashJoinTableData::skewBucketLen, HashJoinTableData::skewBucketNums, Hash::skewColumn, HashJoinTableData::skewEnabled, Hash::skewInherit, Hash::skewTable, HashJoinTableData::spacePeak, HashJoinTableData::spaceUsed, HashJoinTableData::spaceUsedSkew, HashSkewBucket::tuples, and AttStatsSlot::values.

Referenced by ExecHashTableCreate().

◆ ExecHashEstimate()

void ExecHashEstimate	(	HashState *	node,
		ParallelContext *	pcxt
	)

Definition at line 2760 of file nodeHash.c.

{
    size_t      chunkBytes;

    /*
     * Space is only reserved when this node is being instrumented and there
     * is at least one worker.
     */
    if (node->ps.instrument && pcxt->nworkers != 0)
    {
        /*
         * One HashInstrumentation per worker, placed after the part of
         * SharedHashInfo that precedes its hinstrument member.
         */
        chunkBytes = add_size(mul_size(pcxt->nworkers,
                                       sizeof(HashInstrumentation)),
                              offsetof(SharedHashInfo, hinstrument));

        shm_toc_estimate_chunk(&pcxt->estimator, chunkBytes);
        shm_toc_estimate_keys(&pcxt->estimator, 1);
    }
}

References add_size(), ParallelContext::estimator, PlanState::instrument, mul_size(), ParallelContext::nworkers, HashState::ps, shm_toc_estimate_chunk, and shm_toc_estimate_keys.

Referenced by ExecParallelEstimate().

◆ ExecHashGetBucketAndBatch()

void ExecHashGetBucketAndBatch	(	HashJoinTable	hashtable,
		uint32	hashvalue,
		int *	bucketno,
		int *	batchno
	)

Definition at line 1959 of file nodeHash.c.

{
    uint32      bucketMask = (uint32) hashtable->nbuckets - 1;
    uint32      batchCount = (uint32) hashtable->nbatch;

    /* The bucket number is always the hash value masked by (nbuckets - 1). */
    *bucketno = hashvalue & bucketMask;

    /*
     * With more than one batch, the batch number comes from the hash value
     * rotated right by log2_nbuckets and masked by (nbatch - 1).  With a
     * single batch everything is in batch 0.
     */
    if (batchCount > 1)
        *batchno = pg_rotate_right32(hashvalue,
                                     hashtable->log2_nbuckets) & (batchCount - 1);
    else
        *batchno = 0;
}

References HashJoinTableData::log2_nbuckets, HashJoinTableData::nbatch, HashJoinTableData::nbuckets, and pg_rotate_right32().

Referenced by ExecHashIncreaseNumBatches(), ExecHashIncreaseNumBuckets(), ExecHashJoinImpl(), ExecHashRemoveNextSkewBucket(), ExecHashTableInsert(), ExecParallelHashIncreaseNumBuckets(), ExecParallelHashJoinPartitionOuter(), ExecParallelHashRepartitionFirst(), ExecParallelHashRepartitionRest(), ExecParallelHashTableInsert(), and ExecParallelHashTableInsertCurrentBatch().

◆ ExecHashGetSkewBucket()

int ExecHashGetSkewBucket	(	HashJoinTable	hashtable,
		uint32	hashvalue
	)

Definition at line 2554 of file nodeHash.c.

{
    int         mask;
    int         slot;

    /*
     * With skew optimization switched off there is never a skew bucket to
     * report (this is notably the case once the initial batch is done).
     */
    if (!hashtable->skewEnabled)
        return INVALID_SKEW_BUCKET_NO;

    /* skewBucketLen is a power of 2, so masking acts as a modulo. */
    mask = hashtable->skewBucketLen - 1;

    /*
     * Linear probe starting at the hash value's home slot: keep stepping
     * while the slot is occupied by a bucket for some other hash value.
     */
    for (slot = hashvalue & mask;
         hashtable->skewBucket[slot] != NULL &&
         hashtable->skewBucket[slot]->hashvalue != hashvalue;
         slot = (slot + 1) & mask)
        ;

    /*
     * The probe stopped either on the matching bucket or on an empty slot;
     * the latter means the table has no entry for this hash value.
     */
    if (hashtable->skewBucket[slot] == NULL)
        return INVALID_SKEW_BUCKET_NO;

    return slot;
}

References HashSkewBucket::hashvalue, INVALID_SKEW_BUCKET_NO, HashJoinTableData::skewBucket, HashJoinTableData::skewBucketLen, and HashJoinTableData::skewEnabled.

Referenced by ExecHashJoinImpl(), and MultiExecPrivateHash().

◆ ExecHashIncreaseBatchSize()

static bool ExecHashIncreaseBatchSize ( HashJoinTable hashtable )

static

Definition at line 997 of file nodeHash.c.

{
    size_t      fileBufferBytes;

    /*
     * Extra memory that doubling nbatch would cost: each batch may need two
     * buffered files (inner and outer), each with a BLCKSZ buffer.
     */
    fileBufferBytes = (hashtable->nbatch * 2 * (size_t) BLCKSZ);

    /*
     * Either spaceAllowed or the file-buffer space is about to double, so
     * whichever is smaller is the cheaper one to grow; comparing the current
     * values directly answers that.  If the file buffers are the cheaper
     * option, tell the caller to go ahead and add batches.
     */
    if (hashtable->spaceAllowed > fileBufferBytes)
        return false;

    /* Growing the in-memory hash table costs no more: double its budget. */
    hashtable->spaceAllowed *= 2;
    return true;
}

References HashJoinTableData::nbatch, and HashJoinTableData::spaceAllowed.

Referenced by ExecHashIncreaseNumBatches().

◆ ExecHashIncreaseNumBatches()

static void ExecHashIncreaseNumBatches ( HashJoinTable hashtable )

static

Definition at line 1029 of file nodeHash.c.

{
    /*
     * Double nbatch for a hash table that uses the unshared bucket array and
     * per-batch BufFile arrays, then rescan every tuple currently held in the
     * dense-allocated chunks: tuples that still map to curbatch are copied
     * into fresh chunks and relinked into their bucket, all others are
     * written to the inner batch file of their new batch.
     *
     * Returns without changing nbatch when growth has been disabled, when
     * doubling would overflow, or when ExecHashIncreaseBatchSize() chose to
     * raise spaceAllowed instead.
     */
    int         oldnbatch = hashtable->nbatch;
    int         curbatch = hashtable->curbatch;
    int         nbatch;
    long        ninmemory;
    long        nfreed;
    HashMemoryChunk oldchunks;

    /* do nothing if we've decided to shut off growth */
    if (!hashtable->growEnabled)
        return;

    /*
     * safety check to avoid overflow: oldnbatch * 2 must fit in an int, and
     * an array of nbatch pointers must not exceed MaxAllocSize
     */
    if (oldnbatch > Min(INT_MAX / 2, MaxAllocSize / (sizeof(void *) * 2)))
        return;

    /* consider increasing size of the in-memory hash table instead */
    if (ExecHashIncreaseBatchSize(hashtable))
        return;

    nbatch = oldnbatch * 2;
    Assert(nbatch > 1);

#ifdef HJDEBUG
    printf("Hashjoin %p: increasing nbatch to %d because space = %zu\n",
           hashtable, nbatch, hashtable->spaceUsed);
#endif

    if (hashtable->innerBatchFile == NULL)
    {
        /* the batch-file arrays are allocated in spillCxt */
        MemoryContext oldcxt = MemoryContextSwitchTo(hashtable->spillCxt);

        /* we had no file arrays before */
        hashtable->innerBatchFile = palloc0_array(BufFile *, nbatch);
        hashtable->outerBatchFile = palloc0_array(BufFile *, nbatch);

        MemoryContextSwitchTo(oldcxt);

        /* time to establish the temp tablespaces, too */
        PrepareTempTablespaces();
    }
    else
    {
        /* enlarge arrays and zero out added entries */
        hashtable->innerBatchFile = repalloc0_array(hashtable->innerBatchFile, BufFile *, oldnbatch, nbatch);
        hashtable->outerBatchFile = repalloc0_array(hashtable->outerBatchFile, BufFile *, oldnbatch, nbatch);
    }

    hashtable->nbatch = nbatch;

    /*
     * Scan through the existing hash table entries and dump out any that are
     * no longer of the current batch.
     */
    ninmemory = nfreed = 0;

    /*
     * If we know we need to resize nbuckets, we can do it while rebatching,
     * since every tuple is about to be reassigned to a bucket anyway.
     */
    if (hashtable->nbuckets_optimal != hashtable->nbuckets)
    {
        /* we never decrease the number of buckets */
        Assert(hashtable->nbuckets_optimal > hashtable->nbuckets);

        hashtable->nbuckets = hashtable->nbuckets_optimal;
        hashtable->log2_nbuckets = hashtable->log2_nbuckets_optimal;

        hashtable->buckets.unshared =
            repalloc_array(hashtable->buckets.unshared,
                           HashJoinTuple, hashtable->nbuckets);
    }

    /*
     * We will scan through the chunks directly, so that we can reset the
     * buckets now and not have to keep track which tuples in the buckets have
     * already been processed. We will free the old chunks as we go.
     */
    memset(hashtable->buckets.unshared, 0,
           sizeof(HashJoinTuple) * hashtable->nbuckets);

    /*
     * Detach the old chunk list from the hash table before walking it; the
     * dense_alloc() calls below then start building a new list in
     * hashtable->chunks instead of appending to the chunks being scanned.
     */
    oldchunks = hashtable->chunks;
    hashtable->chunks = NULL;

    /* so, let's scan through the old chunks, and all tuples in each chunk */
    while (oldchunks != NULL)
    {
        /* remember the successor now, because this chunk is freed below */
        HashMemoryChunk nextchunk = oldchunks->next.unshared;

        /* position within the buffer (up to oldchunks->used) */
        size_t      idx = 0;

        /* process all tuples stored in this chunk (and then free it) */
        while (idx < oldchunks->used)
        {
            HashJoinTuple hashTuple = (HashJoinTuple) (HASH_CHUNK_DATA(oldchunks) + idx);
            MinimalTuple tuple = HJTUPLE_MINTUPLE(hashTuple);
            int         hashTupleSize = (HJTUPLE_OVERHEAD + tuple->t_len);
            int         bucketno;
            int         batchno;

            ninmemory++;

            /* recompute bucket and batch under the new nbatch/nbuckets */
            ExecHashGetBucketAndBatch(hashtable, hashTuple->hashvalue,
                                      &bucketno, &batchno);

            if (batchno == curbatch)
            {
                /* keep tuple in memory - copy it into the new chunk */
                HashJoinTuple copyTuple;

                copyTuple = (HashJoinTuple) dense_alloc(hashtable, hashTupleSize);
                memcpy(copyTuple, hashTuple, hashTupleSize);

                /* and add it back to the appropriate bucket */
                copyTuple->next.unshared = hashtable->buckets.unshared[bucketno];
                hashtable->buckets.unshared[bucketno] = copyTuple;
            }
            else
            {
                /* dump it out */
                Assert(batchno > curbatch);
                ExecHashJoinSaveTuple(HJTUPLE_MINTUPLE(hashTuple),
                                      hashTuple->hashvalue,
                                      &hashtable->innerBatchFile[batchno],
                                      hashtable);

                /* the tuple no longer counts against in-memory space */
                hashtable->spaceUsed -= hashTupleSize;
                nfreed++;
            }

            /* next tuple in this chunk (tuples are stored MAXALIGN'ed) */
            idx += MAXALIGN(hashTupleSize);

            /* allow this loop to be cancellable */
            CHECK_FOR_INTERRUPTS();
        }

        /* we're done with this chunk - free it and proceed to the next one */
        pfree(oldchunks);
        oldchunks = nextchunk;
    }

#ifdef HJDEBUG
    printf("Hashjoin %p: freed %ld of %ld tuples, space now %zu\n",
           hashtable, nfreed, ninmemory, hashtable->spaceUsed);
#endif

    /*
     * If we dumped out either all or none of the tuples in the table, disable
     * further expansion of nbatch.  This situation implies that we have
     * enough tuples of identical hashvalues to overflow spaceAllowed.
     * Increasing nbatch will not fix it since there's no way to subdivide the
     * group any more finely. We have to just gut it out and hope the server
     * has enough RAM.
     */
    if (nfreed == 0 || nfreed == ninmemory)
    {
        hashtable->growEnabled = false;
#ifdef HJDEBUG
        printf("Hashjoin %p: disabling further increase of nbatch\n",
               hashtable);
#endif
    }
}

Referenced by ExecHashSkewTableInsert(), and ExecHashTableInsert().

◆ ExecHashIncreaseNumBuckets()

static void ExecHashIncreaseNumBuckets ( HashJoinTable hashtable )

static

Definition at line 1586 of file nodeHash.c.

{
    /*
     * Grow the unshared bucket array to nbuckets_optimal and relink every
     * tuple held in the dense-allocated chunks into its new bucket.  The
     * tuples themselves stay where they are; only the bucket chains change.
     */
    HashMemoryChunk cur;

    /* Only ever grow: bail out unless the optimal count is larger. */
    if (hashtable->nbuckets_optimal <= hashtable->nbuckets)
        return;

#ifdef HJDEBUG
    printf("Hashjoin %p: increasing nbuckets %d => %d\n",
           hashtable, hashtable->nbuckets, hashtable->nbuckets_optimal);
#endif

    /* Adopt the optimal bucket count and its log2. */
    hashtable->log2_nbuckets = hashtable->log2_nbuckets_optimal;
    hashtable->nbuckets = hashtable->nbuckets_optimal;

    Assert(hashtable->nbuckets > 1);
    Assert(hashtable->nbuckets <= (INT_MAX / 2));
    Assert(hashtable->nbuckets == (1 << hashtable->log2_nbuckets));

    /*
     * Resize the bucket array and wipe it.  There is no need to walk the old
     * bucket chains: every tuple can be reached through the chunk list, as
     * in ExecHashIncreaseNumBatches but without copying into new chunks.
     */
    hashtable->buckets.unshared =
        repalloc_array(hashtable->buckets.unshared,
                       HashJoinTuple, hashtable->nbuckets);

    memset(hashtable->buckets.unshared, 0,
           hashtable->nbuckets * sizeof(HashJoinTuple));

    /* Visit every chunk, and every tuple within it, to rebuild the chains. */
    cur = hashtable->chunks;
    while (cur != NULL)
    {
        size_t      off;

        for (off = 0; off < cur->used;)
        {
            HashJoinTuple htup = (HashJoinTuple) (HASH_CHUNK_DATA(cur) + off);
            int         bucketno;
            int         batchno;

            ExecHashGetBucketAndBatch(hashtable, htup->hashvalue,
                                      &bucketno, &batchno);

            /* push the tuple onto the head of its bucket's chain */
            htup->next.unshared = hashtable->buckets.unshared[bucketno];
            hashtable->buckets.unshared[bucketno] = htup;

            /* step over this tuple; tuples are laid out MAXALIGN'ed */
            off += MAXALIGN(HJTUPLE_OVERHEAD +
                            HJTUPLE_MINTUPLE(htup)->t_len);
        }

        /* keep the rebuild cancellable, once per chunk */
        CHECK_FOR_INTERRUPTS();

        cur = cur->next.unshared;
    }
}

References Assert(), HashJoinTableData::buckets, CHECK_FOR_INTERRUPTS, HashJoinTableData::chunks, ExecHashGetBucketAndBatch(), HASH_CHUNK_DATA, HashJoinTupleData::hashvalue, HJTUPLE_MINTUPLE, HJTUPLE_OVERHEAD, idx(), HashJoinTableData::log2_nbuckets, HashJoinTableData::log2_nbuckets_optimal, MAXALIGN, HashJoinTableData::nbuckets, HashJoinTableData::nbuckets_optimal, HashJoinTupleData::next, HashMemoryChunkData::next, printf, repalloc_array, HashJoinTupleData::unshared, HashMemoryChunkData::unshared, and HashJoinTableData::unshared.

Referenced by MultiExecPrivateHash().

◆ ExecHashInitializeDSM()

void ExecHashInitializeDSM	(	HashState *	node,
		ParallelContext *	pcxt
	)

Definition at line 2779 of file nodeHash.c.

{
    /*
     * Allocate and register the shared instrumentation area for this Hash
     * node: a SharedHashInfo header followed by one HashInstrumentation slot
     * per worker, stored in the TOC under the plan node's id.
     */
    SharedHashInfo *shinfo;
    size_t      nbytes;

    /* Nothing to set up unless we are instrumenting and have workers. */
    if (!node->ps.instrument || pcxt->nworkers == 0)
        return;

    nbytes = pcxt->nworkers * sizeof(HashInstrumentation) +
        offsetof(SharedHashInfo, hinstrument);

    shinfo = (SharedHashInfo *) shm_toc_allocate(pcxt->toc, nbytes);
    node->shared_info = shinfo;

    /* Every per-worker slot has to start out zeroed. */
    memset(shinfo, 0, nbytes);

    shinfo->num_workers = pcxt->nworkers;
    shm_toc_insert(pcxt->toc, node->ps.plan->plan_node_id, shinfo);
}

References PlanState::instrument, SharedHashInfo::num_workers, ParallelContext::nworkers, PlanState::plan, Plan::plan_node_id, HashState::ps, HashState::shared_info, shm_toc_allocate(), shm_toc_insert(), and ParallelContext::toc.

Referenced by ExecParallelInitializeDSM().

◆ ExecHashInitializeWorker()

void ExecHashInitializeWorker	(	HashState *	node,
		ParallelWorkerContext *	pwcxt
	)

Definition at line 2804 of file nodeHash.c.

{
    /*
     * When instrumenting, look up the shared area registered under this plan
     * node's id and point node->hinstrument at this worker's slot, so that
     * statistics are accumulated there when shutting down or rebuilding the
     * hash table.  Without instrumentation there is nothing to hook up.
     */
    if (node->ps.instrument)
    {
        SharedHashInfo *shinfo;

        shinfo = (SharedHashInfo *)
            shm_toc_lookup(pwcxt->toc, node->ps.plan->plan_node_id, false);

        /* slots are indexed by ParallelWorkerNumber */
        node->hinstrument = &shinfo->hinstrument[ParallelWorkerNumber];
    }
}

References SharedHashInfo::hinstrument, HashState::hinstrument, PlanState::instrument, ParallelWorkerNumber, PlanState::plan, Plan::plan_node_id, HashState::ps, shm_toc_lookup(), and ParallelWorkerContext::toc.

Referenced by ExecParallelInitializeWorker().

◆ ExecHashRemoveNextSkewBucket()

static void ExecHashRemoveNextSkewBucket ( HashJoinTable hashtable )

static

Definition at line 2646 of file nodeHash.c.

{
    /*
     * Dismantle the skew bucket most recently recorded in skewBucketNums:
     * every tuple in it is either moved into the main hash table (if it
     * belongs to the current batch) or written to the inner batch file of
     * its batch, and then the bucket itself is released.
     */
    int         victimSlot;
    HashSkewBucket *victim;
    uint32      hashvalue;
    int         bucketno;
    int         batchno;
    HashJoinTuple cur;
    HashJoinTuple following;

    /* The bucket to drop is the last one listed in skewBucketNums. */
    victimSlot = hashtable->skewBucketNums[hashtable->nSkewBuckets - 1];
    victim = hashtable->skewBucket[victimSlot];

    /*
     * All tuples in a skew bucket share one hash value, so their main-table
     * bucket and batch can be computed once up front.  nbatch cannot
     * increase while we are processing the tuples.
     */
    hashvalue = victim->hashvalue;
    ExecHashGetBucketAndBatch(hashtable, hashvalue, &bucketno, &batchno);

    /*
     * Relocate each tuple.  This must agree with ExecHashTableInsert, which
     * is not called directly because it expects a TupleTableSlot while we
     * already hold HashJoinTuples.
     */
    for (cur = victim->tuples; cur != NULL; cur = following)
    {
        MinimalTuple mtup = HJTUPLE_MINTUPLE(cur);
        Size        nbytes = HJTUPLE_OVERHEAD + mtup->t_len;

        /* fetch the successor before this tuple is freed */
        following = cur->next.unshared;

        if (batchno != hashtable->curbatch)
        {
            /* Belongs to a later batch: spill it to that batch's file. */
            Assert(batchno > hashtable->curbatch);
            ExecHashJoinSaveTuple(mtup, hashvalue,
                                  &hashtable->innerBatchFile[batchno],
                                  hashtable);

            /* it no longer occupies in-memory space at all */
            hashtable->spaceUsed -= nbytes;
        }
        else
        {
            /*
             * Belongs to the current batch: copy it into dense storage, else
             * it would not be found by, e.g., ExecHashIncreaseNumBatches,
             * and link it into the main table.  Overall space is unchanged.
             */
            HashJoinTuple moved;

            moved = (HashJoinTuple) dense_alloc(hashtable, nbytes);
            memcpy(moved, cur, nbytes);

            moved->next.unshared = hashtable->buckets.unshared[bucketno];
            hashtable->buckets.unshared[bucketno] = moved;
        }

        /* Either way the skew copy is gone and skew space shrinks. */
        pfree(cur);
        hashtable->spaceUsedSkew -= nbytes;

        /* allow this loop to be cancellable */
        CHECK_FOR_INTERRUPTS();
    }

    /*
     * Release the bucket struct and clear its slot.
     *
     * NOTE: clearing a slot is only safe because of removal order.  If hash
     * values A and B collide and A was entered first, B sits in a later
     * slot; removing A first would make ExecHashGetSkewBucket hit the NULL
     * before reaching B and wrongly report B as absent.  Buckets are always
     * removed in reverse order of creation, so that cannot happen.
     */
    hashtable->skewBucket[victimSlot] = NULL;
    hashtable->nSkewBuckets--;
    pfree(victim);
    hashtable->spaceUsed -= SKEW_BUCKET_OVERHEAD;
    hashtable->spaceUsedSkew -= SKEW_BUCKET_OVERHEAD;

    /* Still some skew buckets left?  Then we're done. */
    if (hashtable->nSkewBuckets != 0)
        return;

    /*
     * That was the last skew bucket: give up on skew optimization and
     * release the arrays, which are of no further use.
     */
    hashtable->skewEnabled = false;
    pfree(hashtable->skewBucket);
    pfree(hashtable->skewBucketNums);
    hashtable->skewBucket = NULL;
    hashtable->skewBucketNums = NULL;
    hashtable->spaceUsed -= hashtable->spaceUsedSkew;
    hashtable->spaceUsedSkew = 0;
}

Referenced by ExecHashSkewTableInsert().

◆ ExecHashRetrieveInstrumentation()

void ExecHashRetrieveInstrumentation ( HashState * node )

Definition at line 2845 of file nodeHash.c.

{
    /*
     * Swap node->shared_info for a palloc'd copy of the same contents
     * (header plus one HashInstrumentation per worker).  Does nothing when
     * no shared area was set up.
     */
    SharedHashInfo *shared = node->shared_info;

    if (shared != NULL)
    {
        SharedHashInfo *local;
        size_t      nbytes;

        nbytes = shared->num_workers * sizeof(HashInstrumentation) +
            offsetof(SharedHashInfo, hinstrument);

        local = palloc(nbytes);
        memcpy(local, shared, nbytes);
        node->shared_info = local;
    }
}

References SharedHashInfo::num_workers, palloc(), and HashState::shared_info.

Referenced by ExecParallelRetrieveInstrumentation().

◆ ExecHashSkewTableInsert()

static void ExecHashSkewTableInsert	(	HashJoinTable	hashtable,
		TupleTableSlot *	slot,
		uint32	hashvalue,
		int	bucketNumber
	)

static

Definition at line 2600 of file nodeHash.c.

{
    /*
     * Add the slot's tuple to skew bucket number bucketNumber.  The tuple is
     * copied into batchCxt, pushed onto the front of the bucket's list, and
     * space accounting is updated; skew buckets are dropped and nbatch is
     * grown as needed to stay within the configured limits.
     */
    bool        shouldFree;
    MinimalTuple mtup;
    HashJoinTuple hashTuple;
    HashSkewBucket *skew;
    int         entrySize;

    mtup = ExecFetchSlotMinimalTuple(slot, &shouldFree);

    /* Build the HashJoinTuple: header, then a copy of the minimal tuple. */
    entrySize = HJTUPLE_OVERHEAD + mtup->t_len;
    hashTuple = (HashJoinTuple) MemoryContextAlloc(hashtable->batchCxt,
                                                   entrySize);
    hashTuple->hashvalue = hashvalue;
    memcpy(HJTUPLE_MINTUPLE(hashTuple), mtup, mtup->t_len);
    HeapTupleHeaderClearMatch(HJTUPLE_MINTUPLE(hashTuple));

    /* Link it in at the head of the skew bucket's tuple list. */
    skew = hashtable->skewBucket[bucketNumber];
    hashTuple->next.unshared = skew->tuples;
    skew->tuples = hashTuple;
    Assert(hashTuple != hashTuple->next.unshared);

    /* Charge the space and keep the peak up to date. */
    hashtable->spaceUsed += entrySize;
    hashtable->spaceUsedSkew += entrySize;
    if (hashtable->spacePeak < hashtable->spaceUsed)
        hashtable->spacePeak = hashtable->spaceUsed;

    /* Shed skew buckets until the skew table fits its allowance again. */
    while (hashtable->spaceAllowedSkew < hashtable->spaceUsedSkew)
        ExecHashRemoveNextSkewBucket(hashtable);

    /* The overall limit applies as well. */
    if (hashtable->spaceAllowed < hashtable->spaceUsed)
        ExecHashIncreaseNumBatches(hashtable);

    /* Release the fetched tuple if the fetch reported that we must. */
    if (shouldFree)
        heap_free_minimal_tuple(mtup);
}

References Assert(), HashJoinTableData::batchCxt, ExecFetchSlotMinimalTuple(), ExecHashIncreaseNumBatches(), ExecHashRemoveNextSkewBucket(), HashJoinTupleData::hashvalue, heap_free_minimal_tuple(), HeapTupleHeaderClearMatch(), HJTUPLE_MINTUPLE, HJTUPLE_OVERHEAD, MemoryContextAlloc(), HashJoinTupleData::next, HashJoinTableData::skewBucket, HashJoinTableData::spaceAllowed, HashJoinTableData::spaceAllowedSkew, HashJoinTableData::spacePeak, HashJoinTableData::spaceUsed, HashJoinTableData::spaceUsedSkew, MinimalTupleData::t_len, HashSkewBucket::tuples, and HashJoinTupleData::unshared.

Referenced by MultiExecPrivateHash().

◆ ExecHashTableCreate()

HashJoinTable ExecHashTableCreate ( HashState * state )

Definition at line 445 of file nodeHash.c.

{
    /*
     * Build and return a new HashJoinTableData for the given Hash node
     * state: size it with ExecChooseHashTableSize(), initialize the control
     * block, create the hashCxt/batchCxt/spillCxt memory contexts, and then
     * either take part in the parallel build set-up (when
     * state->parallel_state is set) or allocate the private bucket array
     * and, for multi-batch joins, the skew hash table.
     */
    Hash       *node;
    HashJoinTable hashtable;
    Plan       *outerNode;
    size_t      space_allowed;
    int         nbuckets;
    int         nbatch;
    double      rows;
    int         num_skew_mcvs;
    int         log2_nbuckets;
    MemoryContext oldcxt;

    /*
     * Get information about the size of the relation to be hashed (it's the
     * "outer" subtree of this node, but the inner relation of the hashjoin).
     * Compute the appropriate size of the hash table.
     */
    node = (Hash *) state->ps.plan;
    outerNode = outerPlan(node);

    /*
     * If this is shared hash table with a partial plan, then we can't use
     * outerNode->plan_rows to estimate its size.  We need an estimate of the
     * total number of rows across all copies of the partial plan.
     */
    rows = node->plan.parallel_aware ? node->rows_total : outerNode->plan_rows;

    /*
     * NOTE(review): nparticipants - 1 presumably excludes the leader from
     * the worker count -- confirm against ExecChooseHashTableSize().
     */
    ExecChooseHashTableSize(rows, outerNode->plan_width,
                            OidIsValid(node->skewTable),
                            state->parallel_state != NULL,
                            state->parallel_state != NULL ?
                            state->parallel_state->nparticipants - 1 : 0,
                            &space_allowed,
                            &nbuckets, &nbatch, &num_skew_mcvs);

    /* nbuckets must be a power of 2 */
    log2_nbuckets = pg_ceil_log2_32(nbuckets);
    Assert(nbuckets == (1 << log2_nbuckets));

    /*
     * Initialize the hash table control block.
     *
     * The hashtable control block is just palloc'd from the executor's
     * per-query memory context.  Everything else should be kept inside the
     * subsidiary hashCxt, batchCxt or spillCxt.
     */
    hashtable = palloc_object(HashJoinTableData);
    hashtable->nbuckets = nbuckets;
    hashtable->nbuckets_original = nbuckets;
    hashtable->nbuckets_optimal = nbuckets;
    hashtable->log2_nbuckets = log2_nbuckets;
    hashtable->log2_nbuckets_optimal = log2_nbuckets;
    hashtable->buckets.unshared = NULL;
    hashtable->skewEnabled = false;
    hashtable->skewBucket = NULL;
    hashtable->skewBucketLen = 0;
    hashtable->nSkewBuckets = 0;
    hashtable->skewBucketNums = NULL;
    hashtable->nbatch = nbatch;
    hashtable->curbatch = 0;
    hashtable->nbatch_original = nbatch;
    hashtable->nbatch_outstart = nbatch;
    hashtable->growEnabled = true;
    hashtable->totalTuples = 0;
    hashtable->partialTuples = 0;
    hashtable->skewTuples = 0;
    hashtable->innerBatchFile = NULL;
    hashtable->outerBatchFile = NULL;
    hashtable->spaceUsed = 0;
    hashtable->spacePeak = 0;
    hashtable->spaceAllowed = space_allowed;
    hashtable->spaceUsedSkew = 0;
    /* the skew table may use SKEW_HASH_MEM_PERCENT percent of spaceAllowed */
    hashtable->spaceAllowedSkew =
        hashtable->spaceAllowed * SKEW_HASH_MEM_PERCENT / 100;
    hashtable->chunks = NULL;
    hashtable->current_chunk = NULL;
    hashtable->parallel_state = state->parallel_state;
    hashtable->area = state->ps.state->es_query_dsa;
    hashtable->batches = NULL;

#ifdef HJDEBUG
    printf("Hashjoin %p: initial nbatch = %d, nbuckets = %d\n",
           hashtable, nbatch, nbuckets);
#endif

    /*
     * Create temporary memory contexts in which to keep the hashtable working
     * storage.  See notes in executor/hashjoin.h.  batchCxt and spillCxt are
     * both children of hashCxt.
     */
    hashtable->hashCxt = AllocSetContextCreate(CurrentMemoryContext,
                                               "HashTableContext",
                                               ALLOCSET_DEFAULT_SIZES);

    hashtable->batchCxt = AllocSetContextCreate(hashtable->hashCxt,
                                                "HashBatchContext",
                                                ALLOCSET_DEFAULT_SIZES);

    hashtable->spillCxt = AllocSetContextCreate(hashtable->hashCxt,
                                                "HashSpillContext",
                                                ALLOCSET_DEFAULT_SIZES);

    /* Allocate data that will live for the life of the hashjoin */

    oldcxt = MemoryContextSwitchTo(hashtable->hashCxt);

    if (nbatch > 1 && hashtable->parallel_state == NULL)
    {
        MemoryContext oldctx;

        /*
         * allocate and initialize the file arrays in spillCxt (not needed
         * for parallel case which uses shared tuplestores instead of raw
         * files)
         */
        oldctx = MemoryContextSwitchTo(hashtable->spillCxt);

        hashtable->innerBatchFile = palloc0_array(BufFile *, nbatch);
        hashtable->outerBatchFile = palloc0_array(BufFile *, nbatch);

        MemoryContextSwitchTo(oldctx);

        /* The files will not be opened until needed... */
        /* ... but make sure we have temp tablespaces established for them */
        PrepareTempTablespaces();
    }

    /* back to the context that was current on entry */
    MemoryContextSwitchTo(oldcxt);

    if (hashtable->parallel_state)
    {
        ParallelHashJoinState *pstate = hashtable->parallel_state;
        Barrier    *build_barrier;

        /*
         * Attach to the build barrier.  The corresponding detach operation is
         * in ExecHashTableDetach.  Note that we won't attach to the
         * batch_barrier for batch 0 yet.  We'll attach later and start it out
         * in PHJ_BATCH_PROBE phase, because batch 0 is allocated up front and
         * then loaded while hashing (the standard hybrid hash join
         * algorithm), and we'll coordinate that using build_barrier.
         */
        build_barrier = &pstate->build_barrier;
        BarrierAttach(build_barrier);

        /*
         * So far we have no idea whether there are any other participants,
         * and if so, what phase they are working on.  The only thing we care
         * about at this point is whether someone has already created the
         * SharedHashJoinBatch objects and the hash table for batch 0.  One
         * backend will be elected to do that now if necessary.
         */
        if (BarrierPhase(build_barrier) == PHJ_BUILD_ELECT &&
            BarrierArriveAndWait(build_barrier, WAIT_EVENT_HASH_BUILD_ELECT))
        {
            /* we were elected: publish the sizing decisions in shared state */
            pstate->nbatch = nbatch;
            pstate->space_allowed = space_allowed;
            pstate->growth = PHJ_GROWTH_OK;

            /* Set up the shared state for coordinating batches. */
            ExecParallelHashJoinSetUpBatches(hashtable, nbatch);

            /*
             * Allocate batch 0's hash table up front so we can load it
             * directly while hashing.
             */
            pstate->nbuckets = nbuckets;
            ExecParallelHashTableAlloc(hashtable, 0);
        }

        /*
         * The next Parallel Hash synchronization point is in
         * MultiExecParallelHash(), which will progress it all the way to
         * PHJ_BUILD_RUN.  The caller must not return control from this
         * executor node between now and then.
         */
    }
    else
    {
        /*
         * Prepare context for the first-scan space allocations; allocate the
         * hashbucket array therein, and set each bucket "empty".
         */
        MemoryContextSwitchTo(hashtable->batchCxt);

        hashtable->buckets.unshared = palloc0_array(HashJoinTuple, nbuckets);

        /*
         * Set up for skew optimization, if possible and there's a need for
         * more than one batch.  (In a one-batch join, there's no point in
         * it.)
         */
        if (nbatch > 1)
            ExecHashBuildSkewHash(state, hashtable, node, num_skew_mcvs);

        MemoryContextSwitchTo(oldcxt);
    }

    return hashtable;
}

Referenced by ExecHashJoinImpl().

◆ ExecHashTableDestroy()

void ExecHashTableDestroy ( HashJoinTable hashtable )

Definition at line 955 of file nodeHash.c.

{
    /*
     * Tear down a hash table: close any open batch temp files, delete the
     * hash table's memory contexts, and free the control block.
     */

    /*
     * Close every temp file that is still open.  Batch 0 is skipped because
     * it can't have any temp files, and the arrays may not exist at all if
     * nbatch is only 1.  Parallel hash joins don't use these files.
     */
    if (hashtable->innerBatchFile != NULL)
    {
        int         batchno = 1;

        while (batchno < hashtable->nbatch)
        {
            BufFile    *file;

            file = hashtable->innerBatchFile[batchno];
            if (file)
                BufFileClose(file);

            file = hashtable->outerBatchFile[batchno];
            if (file)
                BufFileClose(file);

            batchno++;
        }
    }

    /*
     * Drop the working memory.  Contexts created as children of hashCxt,
     * such as batchCxt, are deleted along with it.
     */
    MemoryContextDelete(hashtable->hashCxt);

    /* Finally release the control block itself. */
    pfree(hashtable);
}

References BufFileClose(), HashJoinTableData::hashCxt, i, HashJoinTableData::innerBatchFile, MemoryContextDelete(), HashJoinTableData::nbatch, HashJoinTableData::outerBatchFile, and pfree().

Referenced by ExecEndHashJoin(), and ExecReScanHashJoin().

◆ ExecHashTableDetach()

void ExecHashTableDetach ( HashJoinTable hashtable )

Definition at line 3400 of file nodeHash.c.

{
    /*
     * Detach from the parallel hash join's build barrier, if attached, and
     * forget the parallel state.  The last participant to detach also frees
     * the shared batch array.
     */
    ParallelHashJoinState *pstate = hashtable->parallel_state;

    /*
     * If we're involved in a parallel query, we must either have gotten all
     * the way to PHJ_BUILD_RUN, or joined too late and be in PHJ_BUILD_FREE.
     */
    Assert(!pstate ||
           BarrierPhase(&pstate->build_barrier) >= PHJ_BUILD_RUN);

    if (pstate != NULL &&
        BarrierPhase(&pstate->build_barrier) == PHJ_BUILD_RUN)
    {
        /* Close out any temporary files held through batch accessors. */
        if (hashtable->batches != NULL)
        {
            int         batchno;

            for (batchno = 0; batchno < hashtable->nbatch; batchno++)
            {
                ParallelHashJoinBatchAccessor *acc = &hashtable->batches[batchno];

                sts_end_write(acc->inner_tuples);
                sts_end_write(acc->outer_tuples);
                sts_end_parallel_scan(acc->inner_tuples);
                sts_end_parallel_scan(acc->outer_tuples);
            }
        }

        /* Whoever detaches last is responsible for the shared memory. */
        if (BarrierArriveAndDetach(&pstate->build_barrier))
        {
            /*
             * Late joining processes will see this state and give up
             * immediately.
             */
            Assert(BarrierPhase(&pstate->build_barrier) == PHJ_BUILD_FREE);

            if (DsaPointerIsValid(pstate->batches))
            {
                dsa_free(hashtable->area, pstate->batches);
                pstate->batches = InvalidDsaPointer;
            }
        }
    }

    /* In every case, this hash table is no longer tied to parallel state. */
    hashtable->parallel_state = NULL;
}

References HashJoinTableData::area, Assert(), BarrierArriveAndDetach(), BarrierPhase(), ParallelHashJoinState::batches, HashJoinTableData::batches, ParallelHashJoinState::build_barrier, dsa_free(), DsaPointerIsValid, i, ParallelHashJoinBatchAccessor::inner_tuples, InvalidDsaPointer, HashJoinTableData::nbatch, ParallelHashJoinBatchAccessor::outer_tuples, HashJoinTableData::parallel_state, PHJ_BUILD_FREE, PHJ_BUILD_RUN, sts_end_parallel_scan(), and sts_end_write().

Referenced by ExecHashJoinReInitializeDSM(), and ExecShutdownHashJoin().

◆ ExecHashTableDetachBatch()

void ExecHashTableDetachBatch ( HashJoinTable hashtable )

Definition at line 3308 of file nodeHash.c.

{
    /*
     * Detach from the batch this backend is currently attached to, if any.
     * Nothing happens unless the hash table has parallel state and curbatch
     * is non-negative.  The backend that turns out to be the last to detach
     * frees the batch's shared chunks and bucket array.  On exit curbatch is
     * set to -1 to record that no batch is attached.
     */
    if (hashtable->parallel_state != NULL &&
        hashtable->curbatch >= 0)
    {
        int         curbatch = hashtable->curbatch;
        ParallelHashJoinBatch *batch = hashtable->batches[curbatch].shared;
        bool        attached = true;

        /* Make sure any temporary files are closed. */
        sts_end_parallel_scan(hashtable->batches[curbatch].inner_tuples);
        sts_end_parallel_scan(hashtable->batches[curbatch].outer_tuples);

        /* After attaching we always get at least to PHJ_BATCH_PROBE. */
        Assert(BarrierPhase(&batch->batch_barrier) == PHJ_BATCH_PROBE ||
               BarrierPhase(&batch->batch_barrier) == PHJ_BATCH_SCAN);

        /*
         * If we're abandoning the PHJ_BATCH_PROBE phase early without having
         * reached the end of it, it means the plan doesn't want any more
         * tuples, and it is happy to abandon any tuples buffered in this
         * process's subplans.  For correctness, we can't allow any process to
         * execute the PHJ_BATCH_SCAN phase, because we will never have the
         * complete set of match bits.  Therefore we skip emitting unmatched
         * tuples in all backends (if this is a full/right join), as if those
         * tuples were all due to be emitted by this process and it has
         * abandoned them too.
         */
        if (BarrierPhase(&batch->batch_barrier) == PHJ_BATCH_PROBE &&
            !hashtable->batches[curbatch].outer_eof)
        {
            /*
             * This flag may be written to by multiple backends during
             * PHJ_BATCH_PROBE phase, but will only be read in PHJ_BATCH_SCAN
             * phase so requires no extra locking.
             */
            batch->skip_unmatched = true;
        }

        /*
         * Even if we aren't doing a full/right outer join, we'll step through
         * the PHJ_BATCH_SCAN phase just to maintain the invariant that
         * freeing happens in PHJ_BATCH_FREE, but that'll be wait-free.
         *
         * "attached" stays true unless BarrierArriveAndDetachExceptLast()
         * returns false; only while it is true do we go on to call
         * BarrierArriveAndDetach().
         */
        if (BarrierPhase(&batch->batch_barrier) == PHJ_BATCH_PROBE)
            attached = BarrierArriveAndDetachExceptLast(&batch->batch_barrier);
        if (attached && BarrierArriveAndDetach(&batch->batch_barrier))
        {
            /*
             * We are no longer attached to the batch barrier, but we're the
             * process that was chosen to free resources and it's safe to
             * assert the current phase.  The ParallelHashJoinBatch can't go
             * away underneath us while we are attached to the build barrier,
             * making this access safe.
             */
            Assert(BarrierPhase(&batch->batch_barrier) == PHJ_BATCH_FREE);

            /* Free shared chunks and buckets. */
            while (DsaPointerIsValid(batch->chunks))
            {
                HashMemoryChunk chunk =
                    dsa_get_address(hashtable->area, batch->chunks);
                /* read the link before the chunk is freed */
                dsa_pointer next = chunk->next.shared;

                dsa_free(hashtable->area, batch->chunks);
                batch->chunks = next;
            }
            if (DsaPointerIsValid(batch->buckets))
            {
                dsa_free(hashtable->area, batch->buckets);
                batch->buckets = InvalidDsaPointer;
            }
        }

        /*
         * Track the largest batch we've been attached to.  Though each
         * backend might see a different subset of batches, explain.c will
         * scan the results from all backends to find the largest value.
         * The figure combines batch->size with the size of the bucket array
         * (one dsa_pointer_atomic per bucket).
         */
        hashtable->spacePeak =
            Max(hashtable->spacePeak,
                batch->size + sizeof(dsa_pointer_atomic) * hashtable->nbuckets);

        /* Remember that we are not attached to a batch. */
        hashtable->curbatch = -1;
    }
}

Referenced by ExecHashJoinReInitializeDSM(), ExecParallelHashJoinNewBatch(), ExecParallelPrepHashTableForUnmatched(), and ExecShutdownHashJoin().

◆ ExecHashTableInsert()

void ExecHashTableInsert	(	HashJoinTable	hashtable,
		TupleTableSlot *	slot,
		uint32	hashvalue
	)

Definition at line 1748 of file nodeHash.c.

{
    /*
     * Insert one inner tuple, taken from 'slot', into the hash join table.
     * 'hashvalue' is the tuple's already-computed hash.  Tuples that belong
     * to the current batch are copied into the in-memory table; tuples of
     * other batches are written to that batch's temp file.
     */
    bool        mustFreeTuple;
    MinimalTuple mintup = ExecFetchSlotMinimalTuple(slot, &mustFreeTuple);
    int         targetBucket;
    int         targetBatch;
 
    /* Map the hash value onto a (bucket, batch) pair. */
    ExecHashGetBucketAndBatch(hashtable, hashvalue,
                              &targetBucket, &targetBatch);
 
    if (targetBatch != hashtable->curbatch)
    {
        /*
         * Not for the batch currently in memory: save the tuple in the temp
         * file of the batch it belongs to, which must be a later one.
         */
        Assert(targetBatch > hashtable->curbatch);
        ExecHashJoinSaveTuple(mintup,
                              hashvalue,
                              &hashtable->innerBatchFile[targetBatch],
                              hashtable);
    }
    else
    {
        /* The tuple goes into the in-memory hash table. */
        double      nonSkewTuples = (hashtable->totalTuples - hashtable->skewTuples);
        int         entrySize = HJTUPLE_OVERHEAD + mintup->t_len;
        HashJoinTuple entry = (HashJoinTuple) dense_alloc(hashtable, entrySize);
 
        /* Fill in the new entry: hash value first, then the tuple body. */
        entry->hashvalue = hashvalue;
        memcpy(HJTUPLE_MINTUPLE(entry), mintup, mintup->t_len);
 
        /*
         * The match flag is cleared unconditionally.  That is fine for
         * tuples reloaded from a batch file too: such a tuple cannot have
         * been matched against an outer tuple before it was written out.
         */
        HeapTupleHeaderClearMatch(HJTUPLE_MINTUPLE(entry));
 
        /* Link the entry in at the head of its bucket chain. */
        entry->next.unshared = hashtable->buckets.unshared[targetBucket];
        hashtable->buckets.unshared[targetBucket] = entry;
 
        /*
         * While we still have a single batch, double the optimal bucket
         * count once the NTUP_PER_BUCKET threshold is crossed -- provided
         * that neither the int nor the allocation size would overflow.
         */
        if (hashtable->nbatch == 1 &&
            nonSkewTuples > (hashtable->nbuckets_optimal * NTUP_PER_BUCKET) &&
            hashtable->nbuckets_optimal <= INT_MAX / 2 &&
            hashtable->nbuckets_optimal * 2 <= MaxAllocSize / sizeof(HashJoinTuple))
        {
            hashtable->nbuckets_optimal *= 2;
            hashtable->log2_nbuckets_optimal += 1;
        }
 
        /* Update space accounting and remember the high-water mark. */
        hashtable->spaceUsed += entrySize;
        if (hashtable->spaceUsed > hashtable->spacePeak)
            hashtable->spacePeak = hashtable->spaceUsed;
 
        /* Over budget (counting the optimal bucket array)?  Add batches. */
        if (hashtable->spaceUsed +
            hashtable->nbuckets_optimal * sizeof(HashJoinTuple)
            > hashtable->spaceAllowed)
            ExecHashIncreaseNumBatches(hashtable);
    }
 
    /* Release the minimal tuple if the slot handed us a private copy. */
    if (mustFreeTuple)
        heap_free_minimal_tuple(mintup);
}

Referenced by ExecHashJoinNewBatch(), and MultiExecPrivateHash().

◆ ExecHashTableReset()

void ExecHashTableReset ( HashJoinTable hashtable )

Definition at line 2326 of file nodeHash.c.

{
    /*
     * Prepare the hash table for a new pass: discard everything stored in
     * the batch context and start over with an empty, zeroed bucket array of
     * the same size as before.
     */
    int         bucketCount = hashtable->nbuckets;
    MemoryContext callerCxt;
 
    /* Throw away all buckets and tuples left over from the prior pass. */
    MemoryContextReset(hashtable->batchCxt);
 
    /* Build a fresh, all-NULL bucket header array in the batch context. */
    callerCxt = MemoryContextSwitchTo(hashtable->batchCxt);
    hashtable->buckets.unshared = palloc0_array(HashJoinTuple, bucketCount);
    MemoryContextSwitchTo(callerCxt);
 
    /* Nothing is stored any more. */
    hashtable->spaceUsed = 0;
 
    /* The chunk list pointed into memory released by the reset above. */
    hashtable->chunks = NULL;
}

References HashJoinTableData::batchCxt, HashJoinTableData::buckets, HashJoinTableData::chunks, MemoryContextReset(), MemoryContextSwitchTo(), HashJoinTableData::nbuckets, palloc0_array, HashJoinTableData::spaceUsed, and HashJoinTableData::unshared.

Referenced by ExecHashJoinNewBatch().

◆ ExecHashTableResetMatchFlags()

void ExecHashTableResetMatchFlags ( HashJoinTable hashtable )

Definition at line 2354 of file nodeHash.c.

{
    /*
     * Clear the "matched" flag of every tuple held in the hash table, both
     * in the regular buckets and in the skew buckets.
     */
    int         bucketno;
    int         skewno;
 
    /* Walk every chain of the main table. */
    for (bucketno = 0; bucketno < hashtable->nbuckets; bucketno++)
    {
        HashJoinTuple cur = hashtable->buckets.unshared[bucketno];
 
        while (cur != NULL)
        {
            HeapTupleHeaderClearMatch(HJTUPLE_MINTUPLE(cur));
            cur = cur->next.unshared;
        }
    }
 
    /* Then do the same for each skew bucket in use, if there are any. */
    for (skewno = 0; skewno < hashtable->nSkewBuckets; skewno++)
    {
        int         slotno = hashtable->skewBucketNums[skewno];
        HashSkewBucket *bucket = hashtable->skewBucket[slotno];
        HashJoinTuple cur = bucket->tuples;
 
        while (cur != NULL)
        {
            HeapTupleHeaderClearMatch(HJTUPLE_MINTUPLE(cur));
            cur = cur->next.unshared;
        }
    }
}

References HashJoinTableData::buckets, HeapTupleHeaderClearMatch(), HJTUPLE_MINTUPLE, i, j, HashJoinTableData::nbuckets, HashJoinTupleData::next, HashJoinTableData::nSkewBuckets, HashJoinTableData::skewBucket, HashJoinTableData::skewBucketNums, HashSkewBucket::tuples, HashJoinTupleData::unshared, and HashJoinTableData::unshared.

Referenced by ExecReScanHashJoin().

◆ ExecInitHash()

HashState * ExecInitHash	(	Hash *	node,
		EState *	estate,
		int	eflags
	)

Definition at line 369 of file nodeHash.c.

{
    /*
     * Build and return the HashState executor node for plan node 'node'.
     * The hash table itself and the hash expression are left unset here.
     */
    HashState  *hstate;
 
    /* Backward scan and mark/restore are not supported by this node. */
    Assert(!(eflags & (EXEC_FLAG_BACKWARD | EXEC_FLAG_MARK)));
 
    /* Allocate the state node and wire up the basic links. */
    hstate = makeNode(HashState);
    hstate->ps.plan = (Plan *) node;
    hstate->ps.state = estate;
    hstate->ps.ExecProcNode = ExecHash;
 
    /* The table is built later, by ExecHashTableCreate() at executor run. */
    hstate->hashtable = NULL;
 
    /* Give the node its own expression context. */
    ExecAssignExprContext(estate, &hstate->ps);
 
    /* Initialize the subplan. */
    outerPlanState(hstate) = ExecInitNode(outerPlan(node), estate, eflags);
 
    /*
     * Set up the result slot and type.  This node does no projection, so no
     * projection info is built.
     */
    ExecInitResultTupleSlotTL(&hstate->ps, &TTSOpsMinimalTuple);
    hstate->ps.ps_ProjInfo = NULL;
 
    /* A Hash plan node is expected to carry no qual. */
    Assert(node->plan.qual == NIL);
 
    /*
     * hash_expr is filled in by ExecInitHashJoin().  The ExprState cannot be
     * built here: ExecBuildHash32Expr's keep_nulls argument depends on the
     * join type, which is not known yet at this point.
     */
    hstate->hash_expr = NULL;
 
    return hstate;
}

References Assert(), EXEC_FLAG_BACKWARD, EXEC_FLAG_MARK, ExecAssignExprContext(), ExecHash(), ExecInitNode(), ExecInitResultTupleSlotTL(), PlanState::ExecProcNode, HashState::hash_expr, HashState::hashtable, makeNode, NIL, outerPlan, outerPlanState, PlanState::plan, Hash::plan, HashState::ps, PlanState::ps_ProjInfo, Plan::qual, PlanState::state, and TTSOpsMinimalTuple.

Referenced by ExecInitNode().

◆ ExecParallelHashCloseBatchAccessors()

static void ExecParallelHashCloseBatchAccessors ( HashJoinTable hashtable )

static

Definition at line 3203 of file nodeHash.c.

{
    /*
     * Shut down this backend's per-batch accessors: finish any writing and
     * scanning of the shared tuplestores, then free the accessor array.
     */
    int         batchno;
 
    for (batchno = 0; batchno < hashtable->nbatch; batchno++)
    {
        ParallelHashJoinBatchAccessor *acc = &hashtable->batches[batchno];
 
        /* Don't leave any files open behind us. */
        sts_end_write(acc->inner_tuples);
        sts_end_write(acc->outer_tuples);
        sts_end_parallel_scan(acc->inner_tuples);
        sts_end_parallel_scan(acc->outer_tuples);
    }
 
    /* Drop the array itself and mark it as gone. */
    pfree(hashtable->batches);
    hashtable->batches = NULL;
}

References HashJoinTableData::batches, i, ParallelHashJoinBatchAccessor::inner_tuples, HashJoinTableData::nbatch, ParallelHashJoinBatchAccessor::outer_tuples, pfree(), sts_end_parallel_scan(), and sts_end_write().

Referenced by ExecParallelHashEnsureBatchAccessors(), and ExecParallelHashIncreaseNumBatches().

◆ ExecParallelHashEnsureBatchAccessors()

static void ExecParallelHashEnsureBatchAccessors ( HashJoinTable hashtable )

static

Definition at line 3224 of file nodeHash.c.

{
    /*
     * Make sure this backend has an accessor array matching the current
     * shared set of batches.  If an array already exists for the same number
     * of batches nothing is done; an array of a different size is closed and
     * rebuilt.
     */
    ParallelHashJoinState *pstate = hashtable->parallel_state;
    bool        haveAccessors = (hashtable->batches != NULL);
    ParallelHashJoinBatch *batchBase;
    MemoryContext callerCxt;
    int         batchno;
 
    /* Existing accessors that match the shared batch count are current. */
    if (haveAccessors && hashtable->nbatch == pstate->nbatch)
        return;
 
    /* Otherwise any existing accessors are stale; get rid of them. */
    if (haveAccessors)
        ExecParallelHashCloseBatchAccessors(hashtable);
 
    /*
     * The shared batch-tracking array must still exist: a backend that joins
     * once the build barrier has reached PHJ_BUILD_FREE should have given up
     * before getting here.
     */
    Assert(DsaPointerIsValid(pstate->batches));
 
    /*
     * Accessors, including the buffers for the temporary files, live in the
     * hash join spill memory context.
     */
    callerCxt = MemoryContextSwitchTo(hashtable->spillCxt);
 
    /* Create a zeroed accessor array sized for the shared batch count. */
    hashtable->nbatch = pstate->nbatch;
    hashtable->batches =
        palloc0_array(ParallelHashJoinBatchAccessor, hashtable->nbatch);
 
    /* Locate the start of the shared pseudo-array of batch objects. */
    batchBase = (ParallelHashJoinBatch *)
        dsa_get_address(hashtable->area, pstate->batches);
 
    /* Point each accessor at its shared batch and attach to its tuplestores. */
    for (batchno = 0; batchno < hashtable->nbatch; batchno++)
    {
        ParallelHashJoinBatch *sharedBatch =
            NthParallelHashJoinBatch(batchBase, batchno);
        ParallelHashJoinBatchAccessor *acc = &hashtable->batches[batchno];
 
        acc->shared = sharedBatch;
        acc->preallocated = 0;
        acc->done = false;
        acc->outer_eof = false;
        acc->inner_tuples =
            sts_attach(ParallelHashJoinBatchInner(sharedBatch),
                       ParallelWorkerNumber + 1,
                       &pstate->fileset);
        acc->outer_tuples =
            sts_attach(ParallelHashJoinBatchOuter(sharedBatch,
                                                  pstate->nparticipants),
                       ParallelWorkerNumber + 1,
                       &pstate->fileset);
    }
 
    MemoryContextSwitchTo(callerCxt);
}

Referenced by ExecParallelHashIncreaseNumBatches(), ExecParallelHashIncreaseNumBuckets(), and MultiExecParallelHash().

◆ ExecParallelHashFirstTuple()

static HashJoinTuple ExecParallelHashFirstTuple	(	HashJoinTable	hashtable,
		int	bucketno
	)

inline static

Definition at line 3450 of file nodeHash.c.

{
    /*
     * Return the backend-local address of the first tuple in shared bucket
     * 'bucketno', as obtained by translating the bucket's head pointer with
     * dsa_get_address().
     */
    dsa_pointer head;
 
    Assert(hashtable->parallel_state);
 
    /* Atomically fetch the head of the bucket's chain ... */
    head = dsa_pointer_atomic_read(&hashtable->buckets.shared[bucketno]);
 
    /* ... and translate it into an address usable by this backend. */
    return (HashJoinTuple) dsa_get_address(hashtable->area, head);
}

References HashJoinTableData::area, Assert(), HashJoinTableData::buckets, dsa_get_address(), dsa_pointer_atomic_read, HashJoinTableData::parallel_state, and HashJoinTableData::shared.

Referenced by ExecParallelScanHashBucket(), and ExecParallelScanHashTableForUnmatched().

◆ ExecParallelHashIncreaseNumBatches()

static void ExecParallelHashIncreaseNumBatches ( HashJoinTable hashtable )

static

Definition at line 1197 of file nodeHash.c.

{
    /*
     * Cooperatively increase the number of batches of a parallel hash join.
     *
     * Every participant calls this and the work is sequenced by
     * pstate->grow_batches_barrier.  The switch enters at whatever phase the
     * barrier is currently in and then falls through the remaining phases:
     *
     *   ELECT        one elected participant sets up the new generation of
     *                batches and queues batch 0's chunks for repartitioning;
     *                the others just close their accessors
     *   REALLOCATE   wait for that to finish
     *   REPARTITION  all participants repartition tuples and merge counters
     *   DECIDE       one elected participant sets pstate->growth and frees
     *                the old batch array
     *   FINISH       wait for that to finish
     */
    ParallelHashJoinState *pstate = hashtable->parallel_state;
 
    Assert(BarrierPhase(&pstate->build_barrier) == PHJ_BUILD_HASH_INNER);
 
    /*
     * It's unlikely, but we need to be prepared for new participants to show
     * up while we're in the middle of this operation so we need to switch on
     * barrier phase here.
     */
    switch (PHJ_GROW_BATCHES_PHASE(BarrierPhase(&pstate->grow_batches_barrier)))
    {
        case PHJ_GROW_BATCHES_ELECT:
 
            /*
             * Elect one participant to prepare to grow the number of batches.
             * This involves reallocating or resetting the buckets of batch 0
             * in preparation for all participants to begin repartitioning the
             * tuples.
             */
            if (BarrierArriveAndWait(&pstate->grow_batches_barrier,
                                     WAIT_EVENT_HASH_GROW_BATCHES_ELECT))
            {
                dsa_pointer_atomic *buckets;
                ParallelHashJoinBatch *old_batch0;
                int         new_nbatch;
                int         i;
 
                /*
                 * Move the old batch out of the way.  old_batch0 is taken
                 * from the accessor's 'shared' pointer before the accessor
                 * array is freed below, and is still dereferenced afterwards.
                 */
                old_batch0 = hashtable->batches[0].shared;
                pstate->old_batches = pstate->batches;
                pstate->old_nbatch = hashtable->nbatch;
                pstate->batches = InvalidDsaPointer;
 
                /*
                 * Free this backend's old accessors.  This frees the accessor
                 * array and sets hashtable->batches to NULL but leaves
                 * hashtable->nbatch alone, so the code below still sees the
                 * old batch count.
                 */
                ExecParallelHashCloseBatchAccessors(hashtable);
 
                /* Figure out how many batches to use. */
                if (hashtable->nbatch == 1)
                {
                    /*
                     * We are going from single-batch to multi-batch.  We need
                     * to switch from one large combined memory budget to the
                     * regular hash_mem budget.
                     */
                    pstate->space_allowed = get_hash_memory_limit();
 
                    /*
                     * The combined hash_mem of all participants wasn't
                     * enough. Therefore one batch per participant would be
                     * approximately equivalent and would probably also be
                     * insufficient.  So try two batches per participant,
                     * rounded up to a power of two.
                     */
                    new_nbatch = pg_nextpower2_32(pstate->nparticipants * 2);
                }
                else
                {
                    /*
                     * We were already multi-batched.  Try doubling the number
                     * of batches.
                     */
                    new_nbatch = hashtable->nbatch * 2;
                }
 
                /*
                 * Allocate new larger generation of batches.  The setup
                 * routine stores new_nbatch in both hashtable->nbatch and
                 * pstate->nbatch, hence the matching assertions around it.
                 */
                Assert(hashtable->nbatch == pstate->nbatch);
                ExecParallelHashJoinSetUpBatches(hashtable, new_nbatch);
                Assert(hashtable->nbatch == pstate->nbatch);
 
                /* Replace or recycle batch 0's bucket array. */
                if (pstate->old_nbatch == 1)
                {
                    double      dtuples;
                    double      dbuckets;
                    int         new_nbuckets;
                    uint32      max_buckets;
 
                    /*
                     * We probably also need a smaller bucket array.  How many
                     * tuples do we expect per batch, assuming we have only
                     * half of them so far?  Normally we don't need to change
                     * the bucket array's size, because the size of each batch
                     * stays the same as we add more batches, but in this
                     * special case we move from a large batch to many smaller
                     * batches and it would be wasteful to keep the large
                     * array.
                     */
                    dtuples = (old_batch0->ntuples * 2.0) / new_nbatch;
 
                    /*
                     * We need to calculate the maximum number of buckets to
                     * stay within the MaxAllocSize boundary.  Round the
                     * maximum number to the previous power of 2 given that
                     * later we round the number to the next power of 2.
                     */
                    max_buckets = pg_prevpower2_32((uint32)
                                                   (MaxAllocSize / sizeof(dsa_pointer_atomic)));
                    dbuckets = ceil(dtuples / NTUP_PER_BUCKET);
                    dbuckets = Min(dbuckets, max_buckets);
                    new_nbuckets = (int) dbuckets;
                    /* Use at least 1024 buckets, then round up to a power of 2. */
                    new_nbuckets = Max(new_nbuckets, 1024);
                    new_nbuckets = pg_nextpower2_32(new_nbuckets);
                    /* Swap the old bucket array for a new one of that size. */
                    dsa_free(hashtable->area, old_batch0->buckets);
                    hashtable->batches[0].shared->buckets =
                        dsa_allocate(hashtable->area,
                                     sizeof(dsa_pointer_atomic) * new_nbuckets);
                    buckets = (dsa_pointer_atomic *)
                        dsa_get_address(hashtable->area,
                                        hashtable->batches[0].shared->buckets);
                    /* Freshly allocated memory: initialize every bucket head. */
                    for (i = 0; i < new_nbuckets; ++i)
                        dsa_pointer_atomic_init(&buckets[i], InvalidDsaPointer);
                    pstate->nbuckets = new_nbuckets;
                }
                else
                {
                    /*
                     * Recycle the existing bucket array: hand the old batch
                     * 0's array to the new batch 0 and empty every bucket.
                     */
                    hashtable->batches[0].shared->buckets = old_batch0->buckets;
                    buckets = (dsa_pointer_atomic *)
                        dsa_get_address(hashtable->area, old_batch0->buckets);
                    for (i = 0; i < hashtable->nbuckets; ++i)
                        dsa_pointer_atomic_write(&buckets[i], InvalidDsaPointer);
                }
 
                /* Move all chunks to the work queue for parallel processing. */
                pstate->chunk_work_queue = old_batch0->chunks;
 
                /* Disable further growth temporarily while we're growing. */
                pstate->growth = PHJ_GROWTH_DISABLED;
            }
            else
            {
                /* All other participants just flush their tuples to disk. */
                ExecParallelHashCloseBatchAccessors(hashtable);
            }
            /* Fall through. */
 
        case PHJ_GROW_BATCHES_REALLOCATE:
            /* Wait for the above to be finished. */
            BarrierArriveAndWait(&pstate->grow_batches_barrier,
                                 WAIT_EVENT_HASH_GROW_BATCHES_REALLOCATE);
            /* Fall through. */
 
        case PHJ_GROW_BATCHES_REPARTITION:
            /* Make sure that we have the current dimensions and buckets. */
            ExecParallelHashEnsureBatchAccessors(hashtable);
            ExecParallelHashTableSetCurrentBatch(hashtable, 0);
            /* Then partition, flush counters. */
            ExecParallelHashRepartitionFirst(hashtable);
            ExecParallelHashRepartitionRest(hashtable);
            ExecParallelHashMergeCounters(hashtable);
            /* Wait for the above to be finished. */
            BarrierArriveAndWait(&pstate->grow_batches_barrier,
                                 WAIT_EVENT_HASH_GROW_BATCHES_REPARTITION);
            /* Fall through. */
 
        case PHJ_GROW_BATCHES_DECIDE:
 
            /*
             * Elect one participant to clean up and decide whether further
             * repartitioning is needed, or should be disabled because it's
             * not helping.
             */
            if (BarrierArriveAndWait(&pstate->grow_batches_barrier,
                                     WAIT_EVENT_HASH_GROW_BATCHES_DECIDE))
            {
                ParallelHashJoinBatch *old_batches;
                bool        space_exhausted = false;
                bool        extreme_skew_detected = false;
 
                /* Make sure that we have the current dimensions and buckets. */
                ExecParallelHashEnsureBatchAccessors(hashtable);
                ExecParallelHashTableSetCurrentBatch(hashtable, 0);
 
                old_batches = dsa_get_address(hashtable->area, pstate->old_batches);
 
                /* Are any of the new generation of batches exhausted? */
                for (int i = 0; i < hashtable->nbatch; ++i)
                {
                    ParallelHashJoinBatch *batch;
                    ParallelHashJoinBatch *old_batch;
                    int         parent;
 
                    batch = hashtable->batches[i].shared;
                    if (batch->space_exhausted ||
                        batch->estimated_size > pstate->space_allowed)
                        space_exhausted = true;
 
                    /* Old-generation batch treated as new batch i's parent. */
                    parent = i % pstate->old_nbatch;
                    old_batch = NthParallelHashJoinBatch(old_batches, parent);
                    if (old_batch->space_exhausted ||
                        batch->estimated_size > pstate->space_allowed)
                    {
                        /*
                         * Did this batch receive ALL of the tuples from its
                         * parent batch?  That would indicate that further
                         * repartitioning isn't going to help (the hash values
                         * are probably all the same).
                         */
                        if (batch->ntuples == hashtable->batches[parent].shared->old_ntuples)
                            extreme_skew_detected = true;
                    }
                }
 
                /* Don't keep growing if it's not helping or we'd overflow. */
                if (extreme_skew_detected || hashtable->nbatch >= INT_MAX / 2)
                    pstate->growth = PHJ_GROWTH_DISABLED;
                else if (space_exhausted)
                    pstate->growth = PHJ_GROWTH_NEED_MORE_BATCHES;
                else
                    pstate->growth = PHJ_GROWTH_OK;
 
                /* Free the old batches in shared memory. */
                dsa_free(hashtable->area, pstate->old_batches);
                pstate->old_batches = InvalidDsaPointer;
            }
            /* Fall through. */
 
        case PHJ_GROW_BATCHES_FINISH:
            /* Wait for the above to complete. */
            BarrierArriveAndWait(&pstate->grow_batches_barrier,
                                 WAIT_EVENT_HASH_GROW_BATCHES_FINISH);
    }
}

Referenced by ExecParallelHashTupleAlloc(), ExecParallelHashTuplePrealloc(), and MultiExecParallelHash().

◆ ExecParallelHashIncreaseNumBuckets()

static void ExecParallelHashIncreaseNumBuckets ( HashJoinTable hashtable )

static

Definition at line 1649 of file nodeHash.c.

{
    /*
     * Cooperatively double the number of buckets of batch 0 of a parallel
     * hash join and reinsert all of its tuples.
     *
     * The work is sequenced by pstate->grow_buckets_barrier.  The switch
     * enters at the barrier's current phase and falls through the rest:
     * ELECT (one participant allocates the larger bucket array and queues
     * the chunks), REALLOCATE (wait for that), REINSERT (all participants
     * pop chunks from the work queue and push their tuples into buckets).
     */
    ParallelHashJoinState *pstate = hashtable->parallel_state;
    int         i;
    HashMemoryChunk chunk;
    dsa_pointer chunk_s;
 
    Assert(BarrierPhase(&pstate->build_barrier) == PHJ_BUILD_HASH_INNER);
 
    /*
     * It's unlikely, but we need to be prepared for new participants to show
     * up while we're in the middle of this operation so we need to switch on
     * barrier phase here.
     */
    switch (PHJ_GROW_BUCKETS_PHASE(BarrierPhase(&pstate->grow_buckets_barrier)))
    {
        case PHJ_GROW_BUCKETS_ELECT:
            /* Elect one participant to prepare to increase nbuckets. */
            if (BarrierArriveAndWait(&pstate->grow_buckets_barrier,
                                     WAIT_EVENT_HASH_GROW_BUCKETS_ELECT))
            {
                size_t      size;
                dsa_pointer_atomic *buckets;
 
                /*
                 * Double the size of the bucket array.  'size' is the byte
                 * size of the new array, so size / 2 is the byte size of the
                 * old one, i.e. the amount by which the array grows; that is
                 * what gets added to the batch's recorded size.
                 */
                pstate->nbuckets *= 2;
                size = pstate->nbuckets * sizeof(dsa_pointer_atomic);
                hashtable->batches[0].shared->size += size / 2;
                dsa_free(hashtable->area, hashtable->batches[0].shared->buckets);
                hashtable->batches[0].shared->buckets =
                    dsa_allocate(hashtable->area, size);
                buckets = (dsa_pointer_atomic *)
                    dsa_get_address(hashtable->area,
                                    hashtable->batches[0].shared->buckets);
                /* Start with every bucket empty. */
                for (i = 0; i < pstate->nbuckets; ++i)
                    dsa_pointer_atomic_init(&buckets[i], InvalidDsaPointer);
 
                /* Put the chunk list onto the work queue. */
                pstate->chunk_work_queue = hashtable->batches[0].shared->chunks;
 
                /* Clear the flag. */
                pstate->growth = PHJ_GROWTH_OK;
            }
            /* Fall through. */
 
        case PHJ_GROW_BUCKETS_REALLOCATE:
            /* Wait for the above to complete. */
            BarrierArriveAndWait(&pstate->grow_buckets_barrier,
                                 WAIT_EVENT_HASH_GROW_BUCKETS_REALLOCATE);
            /* Fall through. */
 
        case PHJ_GROW_BUCKETS_REINSERT:
            /* Reinsert all tuples into the hash table. */
            ExecParallelHashEnsureBatchAccessors(hashtable);
            ExecParallelHashTableSetCurrentBatch(hashtable, 0);
            /* Keep taking chunks off the shared queue until it is empty. */
            while ((chunk = ExecParallelHashPopChunkQueue(hashtable, &chunk_s)))
            {
                size_t      idx = 0;
 
                /* idx is the byte offset of the next tuple in the chunk. */
                while (idx < chunk->used)
                {
                    HashJoinTuple hashTuple = (HashJoinTuple) (HASH_CHUNK_DATA(chunk) + idx);
                    /* The tuple's shared pointer: chunk + header + offset. */
                    dsa_pointer shared = chunk_s + HASH_CHUNK_HEADER_SIZE + idx;
                    int         bucketno;
                    int         batchno;
 
                    ExecHashGetBucketAndBatch(hashtable, hashTuple->hashvalue,
                                              &bucketno, &batchno);
                    Assert(batchno == 0);
 
                    /* add the tuple to the proper bucket */
                    ExecParallelHashPushTuple(&hashtable->buckets.shared[bucketno],
                                              hashTuple, shared);
 
                    /* advance index past the tuple */
                    idx += MAXALIGN(HJTUPLE_OVERHEAD +
                                    HJTUPLE_MINTUPLE(hashTuple)->t_len);
                }
 
                /* allow this loop to be cancellable */
                CHECK_FOR_INTERRUPTS();
            }
            /* Wait until every participant has finished reinserting. */
            BarrierArriveAndWait(&pstate->grow_buckets_barrier,
                                 WAIT_EVENT_HASH_GROW_BUCKETS_REINSERT);
    }
}

References HashJoinTableData::area, Assert(), BarrierArriveAndWait(), BarrierPhase(), HashJoinTableData::batches, ParallelHashJoinBatch::buckets, HashJoinTableData::buckets, ParallelHashJoinState::build_barrier, CHECK_FOR_INTERRUPTS, ParallelHashJoinState::chunk_work_queue, ParallelHashJoinBatch::chunks, dsa_allocate, dsa_free(), dsa_get_address(), dsa_pointer_atomic_init, ExecHashGetBucketAndBatch(), ExecParallelHashEnsureBatchAccessors(), ExecParallelHashPopChunkQueue(), ExecParallelHashPushTuple(), ExecParallelHashTableSetCurrentBatch(), ParallelHashJoinState::grow_buckets_barrier, ParallelHashJoinState::growth, HASH_CHUNK_DATA, HASH_CHUNK_HEADER_SIZE, HashJoinTupleData::hashvalue, HJTUPLE_MINTUPLE, HJTUPLE_OVERHEAD, i, idx(), InvalidDsaPointer, MAXALIGN, ParallelHashJoinState::nbuckets, HashJoinTableData::parallel_state, PHJ_BUILD_HASH_INNER, PHJ_GROW_BUCKETS_ELECT, PHJ_GROW_BUCKETS_PHASE, PHJ_GROW_BUCKETS_REALLOCATE, PHJ_GROW_BUCKETS_REINSERT, PHJ_GROWTH_OK, ParallelHashJoinBatchAccessor::shared, HashJoinTableData::shared, and ParallelHashJoinBatch::size.

Referenced by ExecParallelHashTupleAlloc(), ExecParallelHashTuplePrealloc(), and MultiExecParallelHash().

◆ ExecParallelHashJoinSetUpBatches()

static void ExecParallelHashJoinSetUpBatches	(	HashJoinTable	hashtable,
		int	nbatch
	)

static

Definition at line 3123 of file nodeHash.c.

{
    /*
     * Create a new generation of 'nbatch' batches for a parallel hash join.
     *
     * Allocates a zero-filled shared array of batch objects in the DSA area
     * and records it, with its length, in the shared state.  Then builds
     * this backend's accessor array and, per batch, initializes the batch
     * barrier and the inner and outer shared tuplestores.  The caller must
     * not have an accessor array at this point (asserted).
     */
    ParallelHashJoinState *pstate = hashtable->parallel_state;
    ParallelHashJoinBatch *batches;
    MemoryContext oldcxt;
    int         i;
 
    Assert(hashtable->batches == NULL);
 
    /* Allocate space. */
    pstate->batches =
        dsa_allocate0(hashtable->area,
                      EstimateParallelHashJoinBatch(hashtable) * nbatch);
    pstate->nbatch = nbatch;
    batches = dsa_get_address(hashtable->area, pstate->batches);
 
    /*
     * Use hash join spill memory context to allocate accessors, including
     * buffers for the temporary files.
     */
    oldcxt = MemoryContextSwitchTo(hashtable->spillCxt);
 
    /* Allocate this backend's accessor array. */
    hashtable->nbatch = nbatch;
    hashtable->batches =
        palloc0_array(ParallelHashJoinBatchAccessor, hashtable->nbatch);
 
    /* Set up the shared state, tuplestores and backend-local accessors. */
    for (i = 0; i < hashtable->nbatch; ++i)
    {
        ParallelHashJoinBatchAccessor *accessor = &hashtable->batches[i];
        ParallelHashJoinBatch *shared = NthParallelHashJoinBatch(batches, i);
        char        name[MAXPGPATH];
 
        /*
         * All members of shared were zero-initialized.  We just need to set
         * up the Barrier.
         */
        BarrierInit(&shared->batch_barrier, 0);
        if (i == 0)
        {
            /*
             * Batch 0 doesn't need to be loaded.  Attach to its barrier,
             * advance it phase by phase until it reaches PHJ_BATCH_PROBE,
             * and detach again.
             */
            BarrierAttach(&shared->batch_barrier);
            while (BarrierPhase(&shared->batch_barrier) < PHJ_BATCH_PROBE)
                BarrierArriveAndWait(&shared->batch_barrier, 0);
            BarrierDetach(&shared->batch_barrier);
        }
 
        /* Initialize accessor state.  All members were zero-initialized. */
        accessor->shared = shared;
 
        /*
         * Initialize the shared tuplestores.  They are named "i<N>of<M>"
         * (inner) and "o<N>of<M>" (outer) for batch N of M.
         */
        snprintf(name, sizeof(name), "i%dof%d", i, hashtable->nbatch);
        accessor->inner_tuples =
            sts_initialize(ParallelHashJoinBatchInner(shared),
                           pstate->nparticipants,
                           ParallelWorkerNumber + 1,
                           sizeof(uint32),
                           SHARED_TUPLESTORE_SINGLE_PASS,
                           &pstate->fileset,
                           name);
        snprintf(name, sizeof(name), "o%dof%d", i, hashtable->nbatch);
        accessor->outer_tuples =
            sts_initialize(ParallelHashJoinBatchOuter(shared,
                                                      pstate->nparticipants),
                           pstate->nparticipants,
                           ParallelWorkerNumber + 1,
                           sizeof(uint32),
                           SHARED_TUPLESTORE_SINGLE_PASS,
                           &pstate->fileset,
                           name);
    }
 
    MemoryContextSwitchTo(oldcxt);
}

References HashJoinTableData::area, Assert(), BarrierArriveAndWait(), BarrierAttach(), BarrierDetach(), BarrierInit(), BarrierPhase(), ParallelHashJoinBatch::batch_barrier, ParallelHashJoinState::batches, HashJoinTableData::batches, dsa_allocate0, dsa_get_address(), EstimateParallelHashJoinBatch, ParallelHashJoinState::fileset, i, ParallelHashJoinBatchAccessor::inner_tuples, MAXPGPATH, MemoryContextSwitchTo(), name, ParallelHashJoinState::nbatch, HashJoinTableData::nbatch, ParallelHashJoinState::nparticipants, NthParallelHashJoinBatch, ParallelHashJoinBatchAccessor::outer_tuples, palloc0_array, HashJoinTableData::parallel_state, ParallelHashJoinBatchInner, ParallelHashJoinBatchOuter, ParallelWorkerNumber, PHJ_BATCH_PROBE, ParallelHashJoinBatchAccessor::shared, SHARED_TUPLESTORE_SINGLE_PASS, snprintf, HashJoinTableData::spillCxt, and sts_initialize().

Referenced by ExecHashTableCreate(), and ExecParallelHashIncreaseNumBatches().

◆ ExecParallelHashMergeCounters()

static void ExecParallelHashMergeCounters ( HashJoinTable hashtable )

static

Definition at line 1556 of file nodeHash.c.

{
    /*
     * Fold this backend's per-batch counters into the shared per-batch
     * counters, zero the local ones, and recompute the shared total tuple
     * count.  All of it happens while holding pstate->lock exclusively.
     */
    ParallelHashJoinState *pstate = hashtable->parallel_state;
    int         batchno;
 
    LWLockAcquire(&pstate->lock, LW_EXCLUSIVE);
 
    /* The total is rebuilt from scratch from the shared batch counts. */
    pstate->total_tuples = 0;
 
    for (batchno = 0; batchno < hashtable->nbatch; batchno++)
    {
        ParallelHashJoinBatchAccessor *local = &hashtable->batches[batchno];
 
        /* Transfer each local counter into its shared counterpart ... */
        local->shared->size += local->size;
        local->shared->estimated_size += local->estimated_size;
        local->shared->ntuples += local->ntuples;
        local->shared->old_ntuples += local->old_ntuples;
 
        /* ... and start the local counters over from zero. */
        local->size = 0;
        local->estimated_size = 0;
        local->ntuples = 0;
        local->old_ntuples = 0;
 
        pstate->total_tuples += local->shared->ntuples;
    }
 
    LWLockRelease(&pstate->lock);
}

Referenced by ExecParallelHashIncreaseNumBatches(), and MultiExecParallelHash().

◆ ExecParallelHashNextTuple()

static HashJoinTuple ExecParallelHashNextTuple	(	HashJoinTable	hashtable,
		HashJoinTuple	tuple
	)

inline static

Definition at line 3466 of file nodeHash.c.

{
    /*
     * Follow the given tuple's shared "next" link and return the tuple it
     * points to, resolving the link through the hash table's DSA area.
     */
    Assert(hashtable->parallel_state);

    return (HashJoinTuple) dsa_get_address(hashtable->area,
                                           tuple->next.shared);
}

References HashJoinTableData::area, Assert(), dsa_get_address(), next, HashJoinTupleData::next, HashJoinTableData::parallel_state, and HashJoinTupleData::shared.

Referenced by ExecParallelScanHashBucket(), and ExecParallelScanHashTableForUnmatched().

◆ ExecParallelHashPopChunkQueue()

static HashMemoryChunk ExecParallelHashPopChunkQueue	(	HashJoinTable	hashtable,
		dsa_pointer *	shared
	)

static

Definition at line 3519 of file nodeHash.c.

{
    /*
     * Remove and return the chunk at the head of the shared chunk work
     * queue, storing its DSA pointer in *shared.  Returns NULL (leaving
     * *shared untouched) when the queue is empty.  The queue is manipulated
     * under the parallel state's exclusive lock.
     */
    ParallelHashJoinState *shared_state = hashtable->parallel_state;
    HashMemoryChunk head = NULL;

    LWLockAcquire(&shared_state->lock, LW_EXCLUSIVE);

    if (DsaPointerIsValid(shared_state->chunk_work_queue))
    {
        *shared = shared_state->chunk_work_queue;
        head = (HashMemoryChunk) dsa_get_address(hashtable->area, *shared);

        /* Unlink: the queue now starts at the popped chunk's successor. */
        shared_state->chunk_work_queue = head->next.shared;
    }

    LWLockRelease(&shared_state->lock);

    return head;
}

References HashJoinTableData::area, ParallelHashJoinState::chunk_work_queue, dsa_get_address(), DsaPointerIsValid, ParallelHashJoinState::lock, LW_EXCLUSIVE, LWLockAcquire(), LWLockRelease(), HashMemoryChunkData::next, HashJoinTableData::parallel_state, and HashMemoryChunkData::shared.

Referenced by ExecParallelHashIncreaseNumBuckets(), and ExecParallelHashRepartitionFirst().

◆ ExecParallelHashPushTuple()

static void ExecParallelHashPushTuple	(	dsa_pointer_atomic *	head,
		HashJoinTuple	tuple,
		dsa_pointer	tuple_shared
	)

inline static

Definition at line 3480 of file nodeHash.c.

{
    /*
     * Insert a tuple at the front of the chain whose head is *head.
     *
     * Each attempt records the currently observed head as the tuple's
     * successor and then tries to swing the head to tuple_shared with a
     * compare-and-exchange; the attempt is repeated until the exchange
     * succeeds.
     */
    do
    {
        tuple->next.shared = dsa_pointer_atomic_read(head);
    } while (!dsa_pointer_atomic_compare_exchange(head,
                                                  &tuple->next.shared,
                                                  tuple_shared));
}

References dsa_pointer_atomic_compare_exchange, dsa_pointer_atomic_read, HashJoinTupleData::next, and HashJoinTupleData::shared.

Referenced by ExecParallelHashIncreaseNumBuckets(), ExecParallelHashRepartitionFirst(), ExecParallelHashTableInsert(), and ExecParallelHashTableInsertCurrentBatch().

◆ ExecParallelHashRepartitionFirst()

static void ExecParallelHashRepartitionFirst ( HashJoinTable hashtable )

static

Definition at line 1429 of file nodeHash.c.

{
    /*
     * Drain the shared chunk work queue, redistributing every tuple found in
     * each popped chunk: tuples that still map to batch 0 are copied into a
     * freshly allocated slot and pushed onto their bucket chain, while all
     * others are written to the inner tuplestore of their new batch.  Each
     * chunk is freed once it has been fully processed.
     */
    HashMemoryChunk chunk;
    dsa_pointer chunk_shared;

    Assert(hashtable->nbatch == hashtable->parallel_state->nbatch);

    for (;;)
    {
        size_t      offset;

        chunk = ExecParallelHashPopChunkQueue(hashtable, &chunk_shared);
        if (chunk == NULL)
            break;              /* queue is empty, nothing left to do */

        /* Visit each tuple packed into this chunk. */
        offset = 0;
        while (offset < chunk->used)
        {
            HashJoinTuple src = (HashJoinTuple) (HASH_CHUNK_DATA(chunk) + offset);
            MinimalTuple mtup = HJTUPLE_MINTUPLE(src);
            size_t      aligned_size = MAXALIGN(HJTUPLE_OVERHEAD + mtup->t_len);
            int         bucketno;
            int         batchno;

            ExecHashGetBucketAndBatch(hashtable, src->hashvalue,
                                      &bucketno, &batchno);
            Assert(batchno < hashtable->nbatch);

            if (batchno != 0)
            {
                /* Moved to a later batch: account for it and spill it. */
                hashtable->batches[batchno].estimated_size += aligned_size;
                sts_puttuple(hashtable->batches[batchno].inner_tuples,
                             &src->hashvalue, mtup);
            }
            else
            {
                HashJoinTuple dst;
                dsa_pointer dst_shared;

                /*
                 * Stays in batch 0: copy it into a new chunk.
                 *
                 * NOTE(review): the allocation result is used without a NULL
                 * check; presumably growth cannot be requested while
                 * repartitioning -- confirm against the callers.
                 */
                dst = ExecParallelHashTupleAlloc(hashtable,
                                                 HJTUPLE_OVERHEAD + mtup->t_len,
                                                 &dst_shared);
                dst->hashvalue = src->hashvalue;
                memcpy(HJTUPLE_MINTUPLE(dst), mtup, mtup->t_len);
                ExecParallelHashPushTuple(&hashtable->buckets.shared[bucketno],
                                          dst, dst_shared);
            }

            /* Every tuple came out of batch 0 and landed in batchno. */
            ++hashtable->batches[0].old_ntuples;
            ++hashtable->batches[batchno].ntuples;

            /* Step over this tuple to the next one in the chunk. */
            offset += aligned_size;
        }

        /* The chunk's contents have all been relocated; release it. */
        dsa_free(hashtable->area, chunk_shared);

        CHECK_FOR_INTERRUPTS();
    }
}

References HashJoinTableData::area, Assert(), HashJoinTableData::batches, HashJoinTableData::buckets, CHECK_FOR_INTERRUPTS, dsa_free(), ParallelHashJoinBatchAccessor::estimated_size, ExecHashGetBucketAndBatch(), ExecParallelHashPopChunkQueue(), ExecParallelHashPushTuple(), ExecParallelHashTupleAlloc(), HASH_CHUNK_DATA, HashJoinTupleData::hashvalue, HJTUPLE_MINTUPLE, HJTUPLE_OVERHEAD, idx(), ParallelHashJoinBatchAccessor::inner_tuples, MAXALIGN, ParallelHashJoinState::nbatch, HashJoinTableData::nbatch, ParallelHashJoinBatchAccessor::ntuples, ParallelHashJoinBatchAccessor::old_ntuples, HashJoinTableData::parallel_state, HashJoinTableData::shared, sts_puttuple(), and MinimalTupleData::t_len.

Referenced by ExecParallelHashIncreaseNumBatches().

◆ ExecParallelHashRepartitionRest()

static void ExecParallelHashRepartitionRest ( HashJoinTable hashtable )

static

Definition at line 1496 of file nodeHash.c.

{
    /*
     * Help repartition the previous generation's batches 1..old_nbatch-1:
     * attach to each old batch's inner tuplestore, then scan every one in
     * parallel and write each tuple to the inner tuplestore of the batch it
     * maps to in the new generation, updating the accessor counters as we
     * go.  Batch 0 of the old generation is not touched here.
     */
    ParallelHashJoinState *pstate = hashtable->parallel_state;
    int         n_old = pstate->old_nbatch;
    ParallelHashJoinBatch *prev_batches;
    SharedTuplestoreAccessor **prev_inner;
    int         b;

    /* Locate the previous generation of batches in shared memory. */
    prev_batches = (ParallelHashJoinBatch *)
        dsa_get_address(hashtable->area, pstate->old_batches);

    /* Attach to each old inner tuplestore (slot 0 stays NULL). */
    prev_inner = palloc0_array(SharedTuplestoreAccessor *, n_old);
    for (b = 1; b < n_old; ++b)
    {
        ParallelHashJoinBatch *prev = NthParallelHashJoinBatch(prev_batches, b);

        prev_inner[b] = sts_attach(ParallelHashJoinBatchInner(prev),
                                   ParallelWorkerNumber + 1,
                                   &pstate->fileset);
    }

    /* Take part in scanning and redistributing each old partition. */
    for (b = 1; b < n_old; ++b)
    {
        SharedTuplestoreAccessor *source = prev_inner[b];
        MinimalTuple tuple;
        uint32      hashvalue;

        sts_begin_parallel_scan(source);
        for (;;)
        {
            size_t      tuple_size;
            int         bucketno;
            int         batchno;

            tuple = sts_parallel_scan_next(source, &hashvalue);
            if (tuple == NULL)
                break;          /* this partition is exhausted */

            tuple_size = MAXALIGN(HJTUPLE_OVERHEAD + tuple->t_len);

            /* Work out the tuple's partition in the new generation. */
            ExecHashGetBucketAndBatch(hashtable, hashvalue, &bucketno,
                                      &batchno);

            hashtable->batches[batchno].estimated_size += tuple_size;
            ++hashtable->batches[batchno].ntuples;
            ++hashtable->batches[b].old_ntuples;

            /* Store the tuple in its new batch. */
            sts_puttuple(hashtable->batches[batchno].inner_tuples,
                         &hashvalue, tuple);

            CHECK_FOR_INTERRUPTS();
        }
        sts_end_parallel_scan(source);
    }

    pfree(prev_inner);
}

References HashJoinTableData::area, HashJoinTableData::batches, CHECK_FOR_INTERRUPTS, dsa_get_address(), ParallelHashJoinBatchAccessor::estimated_size, ExecHashGetBucketAndBatch(), ParallelHashJoinState::fileset, HJTUPLE_OVERHEAD, i, ParallelHashJoinBatchAccessor::inner_tuples, MAXALIGN, NthParallelHashJoinBatch, ParallelHashJoinBatchAccessor::ntuples, ParallelHashJoinState::old_batches, ParallelHashJoinState::old_nbatch, ParallelHashJoinBatchAccessor::old_ntuples, palloc0_array, HashJoinTableData::parallel_state, ParallelHashJoinBatchInner, ParallelWorkerNumber, pfree(), sts_attach(), sts_begin_parallel_scan(), sts_end_parallel_scan(), sts_parallel_scan_next(), sts_puttuple(), and MinimalTupleData::t_len.

Referenced by ExecParallelHashIncreaseNumBatches().

◆ ExecParallelHashTableAlloc()

void ExecParallelHashTableAlloc	(	HashJoinTable	hashtable,
		int	batchno
	)

Definition at line 3288 of file nodeHash.c.

{
    /*
     * Allocate the bucket array for the given batch in the hash table's DSA
     * area, record it in the shared batch, and initialize every bucket head
     * to InvalidDsaPointer.  The number of buckets is taken from the
     * parallel state.
     */
    ParallelHashJoinBatch *batch = hashtable->batches[batchno].shared;
    int         nbuckets = hashtable->parallel_state->nbuckets;
    dsa_pointer_atomic *heads;
    int         b = 0;

    batch->buckets = dsa_allocate(hashtable->area,
                                  sizeof(dsa_pointer_atomic) * nbuckets);
    heads = (dsa_pointer_atomic *) dsa_get_address(hashtable->area,
                                                   batch->buckets);

    /* Every chain starts out empty. */
    while (b < nbuckets)
    {
        dsa_pointer_atomic_init(&heads[b], InvalidDsaPointer);
        ++b;
    }
}

References HashJoinTableData::area, HashJoinTableData::batches, ParallelHashJoinBatch::buckets, dsa_allocate, dsa_get_address(), dsa_pointer_atomic_init, i, InvalidDsaPointer, ParallelHashJoinState::nbuckets, HashJoinTableData::parallel_state, and ParallelHashJoinBatchAccessor::shared.

Referenced by ExecHashTableCreate(), and ExecParallelHashJoinNewBatch().

◆ ExecParallelHashTableInsert()

void ExecParallelHashTableInsert	(	HashJoinTable	hashtable,
		TupleTableSlot *	slot,
		uint32	hashvalue
	)

Definition at line 1838 of file nodeHash.c.

{
    /*
     * Insert the tuple held in 'slot' into the parallel hash table: tuples
     * that map to batch 0 are copied into shared memory and pushed onto
     * their bucket chain, tuples for any other batch are written to that
     * batch's inner tuplestore.  Whenever the allocation or preallocation
     * helper reports failure, the bucket/batch numbers are recomputed and
     * the insertion is attempted again.
     */
    bool        shouldFree;
    MinimalTuple tuple = ExecFetchSlotMinimalTuple(slot, &shouldFree);
    int         bucketno;
    int         batchno;

    for (;;)
    {
        ExecHashGetBucketAndBatch(hashtable, hashvalue, &bucketno, &batchno);

        if (batchno != 0)
        {
            size_t      tuple_size = MAXALIGN(HJTUPLE_OVERHEAD + tuple->t_len);

            Assert(batchno > 0);

            /* Reserve more space for this batch if we have run short. */
            if (hashtable->batches[batchno].preallocated < tuple_size &&
                !ExecParallelHashTuplePrealloc(hashtable, batchno, tuple_size))
                continue;       /* start over */

            Assert(hashtable->batches[batchno].preallocated >= tuple_size);
            hashtable->batches[batchno].preallocated -= tuple_size;
            sts_puttuple(hashtable->batches[batchno].inner_tuples, &hashvalue,
                         tuple);
        }
        else
        {
            HashJoinTuple hashTuple;
            dsa_pointer shared;

            /* Batch 0 is loaded straight into memory. */
            Assert(BarrierPhase(&hashtable->parallel_state->build_barrier) ==
                   PHJ_BUILD_HASH_INNER);
            hashTuple = ExecParallelHashTupleAlloc(hashtable,
                                                   HJTUPLE_OVERHEAD + tuple->t_len,
                                                   &shared);
            if (hashTuple == NULL)
                continue;       /* start over */

            /* Fill in the header's hash value and copy the tuple body. */
            hashTuple->hashvalue = hashvalue;
            memcpy(HJTUPLE_MINTUPLE(hashTuple), tuple, tuple->t_len);
            HeapTupleHeaderClearMatch(HJTUPLE_MINTUPLE(hashTuple));

            /* Link it in at the front of its bucket's chain. */
            ExecParallelHashPushTuple(&hashtable->buckets.shared[bucketno],
                                      hashTuple, shared);
        }

        break;                  /* stored successfully */
    }

    ++hashtable->batches[batchno].ntuples;

    if (shouldFree)
        heap_free_minimal_tuple(tuple);
}

References Assert(), BarrierPhase(), HashJoinTableData::batches, HashJoinTableData::buckets, ParallelHashJoinState::build_barrier, ExecFetchSlotMinimalTuple(), ExecHashGetBucketAndBatch(), ExecParallelHashPushTuple(), ExecParallelHashTupleAlloc(), ExecParallelHashTuplePrealloc(), HashJoinTupleData::hashvalue, heap_free_minimal_tuple(), HeapTupleHeaderClearMatch(), HJTUPLE_MINTUPLE, HJTUPLE_OVERHEAD, ParallelHashJoinBatchAccessor::inner_tuples, MAXALIGN, ParallelHashJoinBatchAccessor::ntuples, HashJoinTableData::parallel_state, PHJ_BUILD_HASH_INNER, ParallelHashJoinBatchAccessor::preallocated, HashJoinTableData::shared, sts_puttuple(), and MinimalTupleData::t_len.

Referenced by MultiExecParallelHash().

◆ ExecParallelHashTableInsertCurrentBatch()

void ExecParallelHashTableInsertCurrentBatch	(	HashJoinTable	hashtable,
		TupleTableSlot *	slot,
		uint32	hashvalue
	)

Definition at line 1904 of file nodeHash.c.

{
    /*
     * Copy the tuple held in 'slot' into shared memory and push it onto its
     * bucket chain in the current batch's hash table.  The tuple is asserted
     * to belong to the current batch.
     *
     * NOTE(review): the allocation result is used without a NULL check;
     * presumably growth cannot be requested when this is called -- confirm
     * against the caller.
     */
    bool        shouldFree;
    MinimalTuple mtup = ExecFetchSlotMinimalTuple(slot, &shouldFree);
    int         bucketno;
    int         batchno;
    dsa_pointer entry_shared;
    HashJoinTuple entry;

    ExecHashGetBucketAndBatch(hashtable, hashvalue, &bucketno, &batchno);
    Assert(batchno == hashtable->curbatch);

    entry = ExecParallelHashTupleAlloc(hashtable,
                                       HJTUPLE_OVERHEAD + mtup->t_len,
                                       &entry_shared);

    /* Populate the new entry: hash value, tuple body, cleared match flag. */
    entry->hashvalue = hashvalue;
    memcpy(HJTUPLE_MINTUPLE(entry), mtup, mtup->t_len);
    HeapTupleHeaderClearMatch(HJTUPLE_MINTUPLE(entry));

    ExecParallelHashPushTuple(&hashtable->buckets.shared[bucketno],
                              entry, entry_shared);

    /* Release the fetched minimal tuple only if the fetch says we own it. */
    if (!shouldFree)
        return;
    heap_free_minimal_tuple(mtup);
}

References Assert(), HashJoinTableData::buckets, HashJoinTableData::curbatch, ExecFetchSlotMinimalTuple(), ExecHashGetBucketAndBatch(), ExecParallelHashPushTuple(), ExecParallelHashTupleAlloc(), HashJoinTupleData::hashvalue, heap_free_minimal_tuple(), HeapTupleHeaderClearMatch(), HJTUPLE_MINTUPLE, HJTUPLE_OVERHEAD, HashJoinTableData::shared, and MinimalTupleData::t_len.

Referenced by ExecParallelHashJoinNewBatch().

◆ ExecParallelHashTableSetCurrentBatch()

void ExecParallelHashTableSetCurrentBatch	(	HashJoinTable	hashtable,
		int	batchno
	)

Definition at line 3498 of file nodeHash.c.

{
    /*
     * Make 'batchno' the current batch: point the hash table at that batch's
     * shared bucket array, refresh the bucket count (and its log2) from the
     * parallel state, and forget any current chunk.  The batch's bucket
     * array must already have been allocated.
     */
    int         nbuckets = hashtable->parallel_state->nbuckets;

    Assert(hashtable->batches[batchno].shared->buckets != InvalidDsaPointer);

    hashtable->curbatch = batchno;

    /* Bucket array and its dimensions. */
    hashtable->buckets.shared = (dsa_pointer_atomic *)
        dsa_get_address(hashtable->area,
                        hashtable->batches[batchno].shared->buckets);
    hashtable->nbuckets = nbuckets;
    hashtable->log2_nbuckets = pg_ceil_log2_32(hashtable->nbuckets);

    /* No chunk has been allocated for this batch through this accessor yet. */
    hashtable->batches[batchno].at_least_one_chunk = false;
    hashtable->current_chunk_shared = InvalidDsaPointer;
    hashtable->current_chunk = NULL;
}

Referenced by ExecParallelHashIncreaseNumBatches(), ExecParallelHashIncreaseNumBuckets(), ExecParallelHashJoinNewBatch(), and MultiExecParallelHash().

◆ ExecParallelHashTupleAlloc()

static HashJoinTuple ExecParallelHashTupleAlloc	(	HashJoinTable	hashtable,
		size_t	size,
		dsa_pointer *	shared
	)

static

Definition at line 2975 of file nodeHash.c.

{
    /*
     * Allocate 'size' bytes (rounded up with MAXALIGN) for a tuple in the
     * current batch's shared chunk memory.
     *
     * On success, returns the address of the allocated space and stores the
     * matching dsa_pointer in *shared.  Returns NULL, without allocating,
     * when the hash table needs to grow: either another participant has
     * already requested more batches/buckets (in which case this call first
     * helps with that work), or this call itself decides that growth is
     * needed and records the request in the shared state.  In every NULL
     * case the caller is expected to retry.
     */
    ParallelHashJoinState *pstate = hashtable->parallel_state;
    dsa_pointer chunk_shared;
    HashMemoryChunk chunk;
    Size        chunk_size;
    HashJoinTuple result;
    int         curbatch = hashtable->curbatch;

    size = MAXALIGN(size);

    /*
     * Fast path: if there is enough space in this backend's current chunk,
     * then we can allocate without any locking.
     */
    chunk = hashtable->current_chunk;
    if (chunk != NULL &&
        size <= HASH_CHUNK_THRESHOLD &&
        chunk->maxlen - chunk->used >= size)
    {

        chunk_shared = hashtable->current_chunk_shared;
        Assert(chunk == dsa_get_address(hashtable->area, chunk_shared));
        /* The new tuple starts right after the bytes already used. */
        *shared = chunk_shared + HASH_CHUNK_HEADER_SIZE + chunk->used;
        result = (HashJoinTuple) (HASH_CHUNK_DATA(chunk) + chunk->used);
        chunk->used += size;

        Assert(chunk->used <= chunk->maxlen);
        Assert(result == dsa_get_address(hashtable->area, *shared));

        return result;
    }

    /* Slow path: try to allocate a new chunk. */
    LWLockAcquire(&pstate->lock, LW_EXCLUSIVE);

    /*
     * Check if we need to help increase the number of buckets or batches.
     */
    if (pstate->growth == PHJ_GROWTH_NEED_MORE_BATCHES ||
        pstate->growth == PHJ_GROWTH_NEED_MORE_BUCKETS)
    {
        /* Remember the request before dropping the lock. */
        ParallelHashGrowth growth = pstate->growth;

        /* Forget the current chunk so the fast path is not taken next time. */
        hashtable->current_chunk = NULL;
        LWLockRelease(&pstate->lock);

        /* Another participant has commanded us to help grow. */
        if (growth == PHJ_GROWTH_NEED_MORE_BATCHES)
            ExecParallelHashIncreaseNumBatches(hashtable);
        else if (growth == PHJ_GROWTH_NEED_MORE_BUCKETS)
            ExecParallelHashIncreaseNumBuckets(hashtable);

        /* The caller must retry. */
        return NULL;
    }

    /* Oversized tuples get their own chunk. */
    if (size > HASH_CHUNK_THRESHOLD)
        chunk_size = size + HASH_CHUNK_HEADER_SIZE;
    else
        chunk_size = HASH_CHUNK_SIZE;

    /* Check if it's time to grow batches or buckets. */
    if (pstate->growth != PHJ_GROWTH_DISABLED)
    {
        Assert(curbatch == 0);
        Assert(BarrierPhase(&pstate->build_barrier) == PHJ_BUILD_HASH_INNER);

        /*
         * Check if our space limit would be exceeded.  To avoid choking on
         * very large tuples or very low hash_mem setting, we'll always allow
         * each backend to allocate at least one chunk.
         */
        if (hashtable->batches[0].at_least_one_chunk &&
            hashtable->batches[0].shared->size +
            chunk_size > pstate->space_allowed)
        {
            /* Request more batches and flag batch 0 as out of space. */
            pstate->growth = PHJ_GROWTH_NEED_MORE_BATCHES;
            hashtable->batches[0].shared->space_exhausted = true;
            LWLockRelease(&pstate->lock);

            return NULL;
        }

        /* Check if our load factor limit would be exceeded. */
        if (hashtable->nbatch == 1)
        {
            /*
             * Fold this accessor's tuple count into the shared count first,
             * so that the comparison below uses the shared total.
             */
            hashtable->batches[0].shared->ntuples += hashtable->batches[0].ntuples;
            hashtable->batches[0].ntuples = 0;
            /* Guard against integer overflow and alloc size overflow */
            if (hashtable->batches[0].shared->ntuples + 1 >
                hashtable->nbuckets * NTUP_PER_BUCKET &&
                hashtable->nbuckets < (INT_MAX / 2) &&
                hashtable->nbuckets * 2 <=
                MaxAllocSize / sizeof(dsa_pointer_atomic))
            {
                pstate->growth = PHJ_GROWTH_NEED_MORE_BUCKETS;
                LWLockRelease(&pstate->lock);

                return NULL;
            }
        }
    }

    /* We are cleared to allocate a new chunk. */
    chunk_shared = dsa_allocate(hashtable->area, chunk_size);
    hashtable->batches[curbatch].shared->size += chunk_size;
    hashtable->batches[curbatch].at_least_one_chunk = true;

    /* Set up the chunk. */
    chunk = (HashMemoryChunk) dsa_get_address(hashtable->area, chunk_shared);
    *shared = chunk_shared + HASH_CHUNK_HEADER_SIZE;
    chunk->maxlen = chunk_size - HASH_CHUNK_HEADER_SIZE;
    /* The requested tuple occupies the start of the chunk's data area. */
    chunk->used = size;

    /*
     * Push it onto the list of chunks, so that it can be found if we need to
     * increase the number of buckets or batches (batch 0 only) and later for
     * freeing the memory (all batches).
     */
    chunk->next.shared = hashtable->batches[curbatch].shared->chunks;
    hashtable->batches[curbatch].shared->chunks = chunk_shared;

    /* A chunk made for an oversized tuple never becomes the current chunk. */
    if (size <= HASH_CHUNK_THRESHOLD)
    {
        /*
         * Make this the current chunk so that we can use the fast path to
         * fill the rest of it up in future calls.
         */
        hashtable->current_chunk = chunk;
        hashtable->current_chunk_shared = chunk_shared;
    }
    LWLockRelease(&pstate->lock);

    Assert(HASH_CHUNK_DATA(chunk) == dsa_get_address(hashtable->area, *shared));
    result = (HashJoinTuple) HASH_CHUNK_DATA(chunk);

    return result;
}

Referenced by ExecParallelHashRepartitionFirst(), ExecParallelHashTableInsert(), and ExecParallelHashTableInsertCurrentBatch().

◆ ExecParallelHashTuplePrealloc()

static bool ExecParallelHashTuplePrealloc	(	HashJoinTable	hashtable,
		int	batchno,
		size_t	size
	)

static

Definition at line 3560 of file nodeHash.c.

{
    /*
     * Reserve space in the estimated-size accounting of batch 'batchno' so
     * that tuples can subsequently be written to it.  'size' must already
     * be MAXALIGNed and 'batchno' must be greater than zero.
     *
     * Returns true once the reservation has been recorded.  Returns false,
     * meaning the caller has to start over, either after helping with a
     * growth operation requested by another participant or after requesting
     * more batches because the reservation would exceed the space budget.
     */
    ParallelHashJoinState *pstate = hashtable->parallel_state;
    ParallelHashJoinBatchAccessor *batch = &hashtable->batches[batchno];
    size_t      want = Max(size, HASH_CHUNK_SIZE - HASH_CHUNK_HEADER_SIZE);
    ParallelHashGrowth growth;

    Assert(batchno > 0);
    Assert(batchno < hashtable->nbatch);
    Assert(size == MAXALIGN(size));

    LWLockAcquire(&pstate->lock, LW_EXCLUSIVE);
    growth = pstate->growth;

    /* Is there a pending growth request we must help with first? */
    if (growth == PHJ_GROWTH_NEED_MORE_BATCHES ||
        growth == PHJ_GROWTH_NEED_MORE_BUCKETS)
    {
        LWLockRelease(&pstate->lock);

        if (growth == PHJ_GROWTH_NEED_MORE_BATCHES)
            ExecParallelHashIncreaseNumBatches(hashtable);
        else
            ExecParallelHashIncreaseNumBuckets(hashtable);

        return false;
    }

    /*
     * Would this batch exceed the space budget if it were loaded into
     * memory?  If so, command all participants to help repartition.  A
     * batch is always granted its first reservation.
     */
    if (growth != PHJ_GROWTH_DISABLED &&
        batch->at_least_one_chunk &&
        (batch->shared->estimated_size + want + HASH_CHUNK_HEADER_SIZE
         > pstate->space_allowed))
    {
        pstate->growth = PHJ_GROWTH_NEED_MORE_BATCHES;
        batch->shared->space_exhausted = true;
        LWLockRelease(&pstate->lock);

        return false;
    }

    /* Record the reservation in both the shared and local bookkeeping. */
    batch->shared->estimated_size += want + HASH_CHUNK_HEADER_SIZE;
    batch->preallocated = want;
    batch->at_least_one_chunk = true;
    LWLockRelease(&pstate->lock);

    return true;
}

References Assert(), ParallelHashJoinBatchAccessor::at_least_one_chunk, HashJoinTableData::batches, ParallelHashJoinBatch::estimated_size, ExecParallelHashIncreaseNumBatches(), ExecParallelHashIncreaseNumBuckets(), ParallelHashJoinState::growth, HASH_CHUNK_HEADER_SIZE, HASH_CHUNK_SIZE, ParallelHashJoinState::lock, LW_EXCLUSIVE, LWLockAcquire(), LWLockRelease(), Max, MAXALIGN, HashJoinTableData::parallel_state, PHJ_GROWTH_DISABLED, PHJ_GROWTH_NEED_MORE_BATCHES, PHJ_GROWTH_NEED_MORE_BUCKETS, ParallelHashJoinBatchAccessor::preallocated, ParallelHashJoinBatchAccessor::shared, ParallelHashJoinState::space_allowed, and ParallelHashJoinBatch::space_exhausted.

Referenced by ExecParallelHashTableInsert().

◆ ExecParallelPrepHashTableForUnmatched()

bool ExecParallelPrepHashTableForUnmatched ( HashJoinState * hjstate )

Definition at line 2124 of file nodeHash.c.

{
    /*
     * Decide whether this process will run the unmatched-tuple scan for the
     * current batch.  Returns true if it was elected and the scan state has
     * been prepared; returns false if the process detached from the batch
     * or the scan is to be skipped.
     */
    HashJoinTable hashtable = hjstate->hj_HashTable;
    int         curbatch = hashtable->curbatch;
    ParallelHashJoinBatch *batch = hashtable->batches[curbatch].shared;

    Assert(BarrierPhase(&batch->batch_barrier) == PHJ_BATCH_PROBE);

    /*
     * It would not be deadlock-free to wait on the batch barrier, because it
     * is in PHJ_BATCH_PROBE phase, and thus processes attached to it have
     * already emitted tuples.  Therefore, we'll hold a wait-free election:
     * only one process can continue to the next phase, and all others detach
     * from this batch.  They can still do any work on other batches, if there
     * are any.
     */
    if (BarrierArriveAndDetachExceptLast(&batch->batch_barrier))
    {
        /* We won the election and are now alone with this batch. */
        Assert(BarrierPhase(&batch->batch_barrier) == PHJ_BATCH_SCAN);

        /*
         * Has another process decided to give up early and command all
         * processes to skip the unmatched scan?
         */
        if (batch->skip_unmatched)
        {
            hashtable->batches[curbatch].done = true;
            ExecHashTableDetachBatch(hashtable);
            return false;
        }

        /* Set up the process-local state, just as for non-parallel join. */
        ExecPrepHashTableForUnmatched(hjstate);

        return true;
    }

    /* We have detached; this process considers the batch to be done. */
    hashtable->batches[curbatch].done = true;

    /* Make sure any temporary files are closed. */
    sts_end_parallel_scan(hashtable->batches[curbatch].inner_tuples);
    sts_end_parallel_scan(hashtable->batches[curbatch].outer_tuples);

    /*
     * Track largest batch we've seen, which would normally happen in
     * ExecHashTableDetachBatch().
     */
    hashtable->spacePeak =
        Max(hashtable->spacePeak,
            batch->size + sizeof(dsa_pointer_atomic) * hashtable->nbuckets);
    hashtable->curbatch = -1;

    return false;
}

References Assert(), BarrierArriveAndDetachExceptLast(), BarrierPhase(), ParallelHashJoinBatch::batch_barrier, HashJoinTableData::batches, HashJoinTableData::curbatch, ParallelHashJoinBatchAccessor::done, ExecHashTableDetachBatch(), ExecPrepHashTableForUnmatched(), HashJoinState::hj_HashTable, ParallelHashJoinBatchAccessor::inner_tuples, Max, HashJoinTableData::nbuckets, ParallelHashJoinBatchAccessor::outer_tuples, PHJ_BATCH_PROBE, PHJ_BATCH_SCAN, ParallelHashJoinBatchAccessor::shared, ParallelHashJoinBatch::size, ParallelHashJoinBatch::skip_unmatched, HashJoinTableData::spacePeak, and sts_end_parallel_scan().

Referenced by ExecHashJoinImpl().

◆ ExecParallelScanHashBucket()

bool ExecParallelScanHashBucket	(	HashJoinState *	hjstate,
		ExprContext *	econtext
	)

Definition at line 2052 of file nodeHash.c.

{
    /*
     * Search the current bucket of the parallel hash table for the next
     * tuple whose hash value equals hj_CurHashValue and which passes the
     * hash clauses.  On a hit, the tuple is left in econtext's inner-tuple
     * slot, remembered in hj_CurTuple, and true is returned; false means
     * the chain has been exhausted.
     */
    HashJoinTable hashtable = hjstate->hj_HashTable;
    ExprState  *quals = hjstate->hashclauses;
    uint32      wanted = hjstate->hj_CurHashValue;
    HashJoinTuple cursor = hjstate->hj_CurTuple;

    /*
     * hj_CurTuple is the tuple returned last time, or NULL when a new bucket
     * is to be scanned from its head.
     */
    if (cursor == NULL)
        cursor = ExecParallelHashFirstTuple(hashtable,
                                            hjstate->hj_CurBucketNo);
    else
        cursor = ExecParallelHashNextTuple(hashtable, cursor);

    for (; cursor != NULL;
         cursor = ExecParallelHashNextTuple(hashtable, cursor))
    {
        if (cursor->hashvalue != wanted)
            continue;

        /* put the candidate into the exec slot so ExecQual can see it */
        econtext->ecxt_innertuple =
            ExecStoreMinimalTuple(HJTUPLE_MINTUPLE(cursor),
                                  hjstate->hj_HashTupleSlot,
                                  false);   /* do not pfree */

        if (ExecQualAndReset(quals, econtext))
        {
            hjstate->hj_CurTuple = cursor;
            return true;
        }
    }

    /* ran off the end of the chain without a match */
    return false;
}

References ExprContext::ecxt_innertuple, ExecParallelHashFirstTuple(), ExecParallelHashNextTuple(), ExecQualAndReset(), ExecStoreMinimalTuple(), HashJoinState::hashclauses, HashJoinTupleData::hashvalue, HashJoinState::hj_CurBucketNo, HashJoinState::hj_CurHashValue, HashJoinState::hj_CurTuple, HashJoinState::hj_HashTable, HashJoinState::hj_HashTupleSlot, and HJTUPLE_MINTUPLE.

Referenced by ExecHashJoinImpl().

◆ ExecParallelScanHashTableForUnmatched()

bool ExecParallelScanHashTableForUnmatched	(	HashJoinState *	hjstate,
		ExprContext *	econtext
	)

Definition at line 2263 of file nodeHash.c.

{
    /*
     * Return the next tuple of the parallel hash table whose match flag is
     * not set.  On success the tuple is left in econtext's inner-tuple slot,
     * remembered in hj_CurTuple, and true is returned; false means all
     * buckets have been scanned.
     */
    HashJoinTable hashtable = hjstate->hj_HashTable;
    HashJoinTuple cursor = hjstate->hj_CurTuple;

    for (;;)
    {
        /*
         * A non-NULL cursor is the tuple returned last (continue after it);
         * NULL means it is time to start on the next bucket, if any remain.
         */
        if (cursor != NULL)
            cursor = ExecParallelHashNextTuple(hashtable, cursor);
        else
        {
            int         bucketno;

            if (hjstate->hj_CurBucketNo >= hashtable->nbuckets)
                return false;   /* finished all buckets */

            bucketno = hjstate->hj_CurBucketNo++;
            cursor = ExecParallelHashFirstTuple(hashtable, bucketno);
        }

        for (; cursor != NULL;
             cursor = ExecParallelHashNextTuple(hashtable, cursor))
        {
            if (HeapTupleHeaderHasMatch(HJTUPLE_MINTUPLE(cursor)))
                continue;

            /* put the unmatched tuple into the exec slot */
            econtext->ecxt_innertuple =
                ExecStoreMinimalTuple(HJTUPLE_MINTUPLE(cursor),
                                      hjstate->hj_HashTupleSlot,
                                      false);   /* do not pfree */

            /*
             * Reset temp memory each time; although this function doesn't do
             * any qual eval, the caller will, so let's keep it parallel to
             * ExecScanHashBucket.
             */
            ResetExprContext(econtext);

            hjstate->hj_CurTuple = cursor;
            return true;
        }

        /* allow this loop to be cancellable */
        CHECK_FOR_INTERRUPTS();
    }
}

References CHECK_FOR_INTERRUPTS, ExprContext::ecxt_innertuple, ExecParallelHashFirstTuple(), ExecParallelHashNextTuple(), ExecStoreMinimalTuple(), HeapTupleHeaderHasMatch(), HashJoinState::hj_CurBucketNo, HashJoinState::hj_CurTuple, HashJoinState::hj_HashTable, HashJoinState::hj_HashTupleSlot, HJTUPLE_MINTUPLE, HashJoinTableData::nbuckets, and ResetExprContext.

Referenced by ExecHashJoinImpl().

◆ ExecPrepHashTableForUnmatched()

void ExecPrepHashTableForUnmatched ( HashJoinState * hjstate )

Definition at line 2103 of file nodeHash.c.

{
    /*----------
     * Reset the scan position used while returning unmatched tuples.
     * The HashJoinState fields serve the following roles during that scan:
     *
     * hj_CurTuple: last tuple returned, or NULL to start next bucket
     * hj_CurBucketNo: next regular bucket to scan
     * hj_CurSkewBucketNo: next skew bucket (an index into skewBucketNums)
     *----------
     */
    hjstate->hj_CurTuple = NULL;
    hjstate->hj_CurBucketNo = 0;
    hjstate->hj_CurSkewBucketNo = 0;
}

References HashJoinState::hj_CurBucketNo, HashJoinState::hj_CurSkewBucketNo, and HashJoinState::hj_CurTuple.

Referenced by ExecHashJoinImpl(), and ExecParallelPrepHashTableForUnmatched().

◆ ExecReScanHash()

void ExecReScanHash ( HashState * node )

Definition at line 2380 of file nodeHash.c.

{
    PlanState  *subplan = outerPlanState(node);

    /*
     * When the subnode's chgParam is set there is nothing to do here: the
     * plan will be re-scanned by the first ExecProcNode.
     */
    if (subplan->chgParam != NULL)
        return;

    ExecReScan(subplan);
}

References ExecReScan(), outerPlan, and outerPlanState.

Referenced by ExecReScan().

◆ ExecScanHashBucket()

bool ExecScanHashBucket	(	HashJoinState *	hjstate,
		ExprContext *	econtext
	)

Definition at line 1991 of file nodeHash.c.

{
    /*
     * Search the current bucket for the next tuple whose hash value equals
     * hj_CurHashValue and which passes the hash clauses.  On a hit, the
     * tuple is left in econtext's inner-tuple slot, remembered in
     * hj_CurTuple, and true is returned; false means the chain has been
     * exhausted.
     */
    HashJoinTable hashtable = hjstate->hj_HashTable;
    ExprState  *quals = hjstate->hashclauses;
    uint32      wanted = hjstate->hj_CurHashValue;
    HashJoinTuple cursor = hjstate->hj_CurTuple;

    /*
     * A non-NULL hj_CurTuple is the tuple returned last time, so resume just
     * after it.  Otherwise start a new chain: the skew bucket if the outer
     * tuple hashed to one, else the standard hashtable bucket.
     */
    if (cursor != NULL)
        cursor = cursor->next.unshared;
    else if (hjstate->hj_CurSkewBucketNo != INVALID_SKEW_BUCKET_NO)
        cursor = hashtable->skewBucket[hjstate->hj_CurSkewBucketNo]->tuples;
    else
        cursor = hashtable->buckets.unshared[hjstate->hj_CurBucketNo];

    for (; cursor != NULL; cursor = cursor->next.unshared)
    {
        if (cursor->hashvalue != wanted)
            continue;

        /* put the candidate into the exec slot so ExecQual can see it */
        econtext->ecxt_innertuple =
            ExecStoreMinimalTuple(HJTUPLE_MINTUPLE(cursor),
                                  hjstate->hj_HashTupleSlot,
                                  false);   /* do not pfree */

        if (ExecQualAndReset(quals, econtext))
        {
            hjstate->hj_CurTuple = cursor;
            return true;
        }
    }

    /* ran off the end of the chain without a match */
    return false;
}

References HashJoinTableData::buckets, ExprContext::ecxt_innertuple, ExecQualAndReset(), ExecStoreMinimalTuple(), HashJoinState::hashclauses, HashJoinTupleData::hashvalue, HashJoinState::hj_CurBucketNo, HashJoinState::hj_CurHashValue, HashJoinState::hj_CurSkewBucketNo, HashJoinState::hj_CurTuple, HashJoinState::hj_HashTable, HashJoinState::hj_HashTupleSlot, HJTUPLE_MINTUPLE, INVALID_SKEW_BUCKET_NO, HashJoinTupleData::next, HashJoinTableData::skewBucket, HashSkewBucket::tuples, HashJoinTupleData::unshared, and HashJoinTableData::unshared.

Referenced by ExecHashJoinImpl().

◆ ExecScanHashTableForUnmatched()

bool ExecScanHashTableForUnmatched	(	HashJoinState *	hjstate,
		ExprContext *	econtext
	)

Definition at line 2189 of file nodeHash.c.

{
    /*
     * Resume a scan of the whole hash table for tuples that have not been
     * marked as matched.
     *
     * The regular buckets are visited first (tracked by hj_CurBucketNo),
     * then the skew buckets (tracked by hj_CurSkewBucketNo).  When a tuple
     * for which HeapTupleHeaderHasMatch() is false turns up, it is stored
     * into hj_HashTupleSlot, published as econtext->ecxt_innertuple and
     * remembered in hj_CurTuple, and the function returns true.  Once every
     * bucket has been exhausted it returns false.
     */
    HashJoinTable table = hjstate->hj_HashTable;
    HashJoinTuple cursor = hjstate->hj_CurTuple;

    for (;;)
    {
        /*
         * Pick the chain position to examine next.  A non-NULL cursor is the
         * tuple handed out last time, so step past it; a NULL cursor means
         * the previous chain is finished and a new bucket must be opened.
         */
        if (cursor != NULL)
        {
            cursor = cursor->next.unshared;
        }
        else if (hjstate->hj_CurBucketNo < table->nbuckets)
        {
            /* open the next regular bucket and advance the bucket counter */
            cursor = table->buckets.unshared[hjstate->hj_CurBucketNo++];
        }
        else if (hjstate->hj_CurSkewBucketNo < table->nSkewBuckets)
        {
            /* regular buckets are done; open the next active skew bucket */
            int         skewno = table->skewBucketNums[hjstate->hj_CurSkewBucketNo++];

            cursor = table->skewBucket[skewno]->tuples;
        }
        else
        {
            break;              /* finished all buckets */
        }

        for (; cursor != NULL; cursor = cursor->next.unshared)
        {
            /* tuples already flagged as matched are of no interest */
            if (HeapTupleHeaderHasMatch(HJTUPLE_MINTUPLE(cursor)))
                continue;

            /* insert hashtable's tuple into exec slot; do not pfree it */
            econtext->ecxt_innertuple =
                ExecStoreMinimalTuple(HJTUPLE_MINTUPLE(cursor),
                                      hjstate->hj_HashTupleSlot,
                                      false);

            /*
             * No qual is evaluated here, but the caller will evaluate one,
             * so reset the per-tuple memory just as ExecScanHashBucket does.
             */
            ResetExprContext(econtext);

            hjstate->hj_CurTuple = cursor;
            return true;
        }

        /* allow this loop to be cancellable */
        CHECK_FOR_INTERRUPTS();
    }

    /* no more unmatched tuples */
    return false;
}

References HashJoinTableData::buckets, CHECK_FOR_INTERRUPTS, ExprContext::ecxt_innertuple, ExecStoreMinimalTuple(), HeapTupleHeaderHasMatch(), HashJoinState::hj_CurBucketNo, HashJoinState::hj_CurSkewBucketNo, HashJoinState::hj_CurTuple, HashJoinState::hj_HashTable, HashJoinState::hj_HashTupleSlot, HJTUPLE_MINTUPLE, j, HashJoinTableData::nbuckets, HashJoinTupleData::next, HashJoinTableData::nSkewBuckets, ResetExprContext, HashJoinTableData::skewBucket, HashJoinTableData::skewBucketNums, HashSkewBucket::tuples, HashJoinTupleData::unshared, and HashJoinTableData::unshared.

Referenced by ExecHashJoinImpl().

◆ ExecShutdownHash()

void ExecShutdownHash ( HashState * node )

Definition at line 2830 of file nodeHash.c.

{
    /*
     * Save statistics of the current (final) hash table into
     * node->hinstrument.
     *
     * The save area is allocated lazily: only when the node has
     * instrumentation attached (i.e. we are EXPLAIN'ing) and no area has
     * been allocated yet.
     */
    if (node->hinstrument == NULL && node->ps.instrument != NULL)
        node->hinstrument = palloc0_object(HashInstrumentation);

    /* Nothing to record without both a save area and a hash table */
    if (node->hinstrument == NULL || node->hashtable == NULL)
        return;

    ExecHashAccumInstrumentation(node->hinstrument, node->hashtable);
}

References ExecHashAccumInstrumentation(), HashState::hashtable, HashState::hinstrument, PlanState::instrument, palloc0_object, and HashState::ps.

Referenced by ExecShutdownNode_walker().

◆ get_hash_memory_limit()

size_t get_hash_memory_limit ( void )

Definition at line 3621 of file nodeHash.c.

{
    /*
     * Compute the hash memory limit as work_mem * hash_mem_multiplier *
     * 1024, returned as a size_t.
     *
     * The product is formed in double arithmetic so that it cannot overflow
     * an integer type, and is then capped at SIZE_MAX before conversion.
     */
    const double ceiling = (double) SIZE_MAX;
    double      limit = (double) work_mem * hash_mem_multiplier * 1024.0;

    /* Clamp in case the product doesn't fit in size_t */
    return (size_t) ((limit < ceiling) ? limit : ceiling);
}

References hash_mem_multiplier, Min, and work_mem.

Referenced by consider_groupingsets_paths(), cost_memoize_rescan(), create_setop_path(), ExecChooseHashTableSize(), ExecInitMemoize(), ExecParallelHashIncreaseNumBatches(), final_cost_hashjoin(), hash_agg_set_limits(), hash_choose_num_partitions(), subpath_is_hashable(), and subplan_is_hashable().

◆ MultiExecHash()

Node * MultiExecHash ( HashState * node )

Definition at line 104 of file nodeHash.c.

{
    /*
     * Build the hash table for this Hash node, using either the parallel or
     * the private build routine, and bracket the work with instrumentation
     * calls when the node is instrumented.  Always returns NULL.
     */

    /* instrumentation is not handled for us, so start it by hand */
    if (node->ps.instrument != NULL)
        InstrStartNode(node->ps.instrument);

    /* the presence of parallel state selects the build strategy */
    if (node->parallel_state == NULL)
        MultiExecPrivateHash(node);
    else
        MultiExecParallelHash(node);

    /* ... and stop it by hand, reporting the tuples this process handled */
    if (node->ps.instrument != NULL)
        InstrStopNode(node->ps.instrument, node->hashtable->partialTuples);

    /*
     * The hash table is not a subtype of Node, so handing it back here would
     * violate the MultiExecProcNode API.  The parent Hashjoin node is
     * expected to fetch it from our node state instead.  That is ugly, but
     * Hashjoin already knows plenty about Hash, so it is not worth cleaning
     * up.
     */
    return NULL;
}

References HashState::hashtable, InstrStartNode(), InstrStopNode(), PlanState::instrument, MultiExecParallelHash(), MultiExecPrivateHash(), HashState::parallel_state, HashJoinTableData::partialTuples, and HashState::ps.

Referenced by MultiExecProcNode().

◆ MultiExecParallelHash()

static void MultiExecParallelHash ( HashState * node )

static

Definition at line 218 of file nodeHash.c.

{
    /*
     * Take part in building the shared hash table of a parallel hash join.
     *
     * Depending on the phase build_barrier is in when we get here, this
     * process waits for table allocation, hashes tuples pulled from the
     * outer plan into the table, flushes its batch files, merges its
     * counters into the shared state, or skips the build entirely.  In every
     * case it finishes by copying the agreed bucket count and tuple total
     * from the shared state into the local hashtable.
     */
    ParallelHashJoinState *pstate;
    PlanState  *outerNode;
    HashJoinTable hashtable;
    TupleTableSlot *slot;
    ExprContext *econtext;
    uint32      hashvalue;
    Barrier    *build_barrier;
    int         i;

    /*
     * get state info from node
     */
    outerNode = outerPlanState(node);
    hashtable = node->hashtable;

    /*
     * set expression context
     */
    econtext = node->ps.ps_ExprContext;

    /*
     * Synchronize the parallel hash table build.  At this stage we know that
     * the shared hash table has been or is being set up by
     * ExecHashTableCreate(), but we don't know if our peers have returned
     * from there or are here in MultiExecParallelHash(), and if so how far
     * through they are.  To find out, we check the build_barrier phase and
     * then jump to the right step in the build algorithm.
     *
     * Only the two earliest phases have cases below; for any later phase no
     * case matches and control continues after the switch.
     */
    pstate = hashtable->parallel_state;
    build_barrier = &pstate->build_barrier;
    Assert(BarrierPhase(build_barrier) >= PHJ_BUILD_ALLOCATE);
    switch (BarrierPhase(build_barrier))
    {
        case PHJ_BUILD_ALLOCATE:

            /*
             * Either I just allocated the initial hash table in
             * ExecHashTableCreate(), or someone else is doing that.  Either
             * way, wait for everyone to arrive here so we can proceed.
             */
            BarrierArriveAndWait(build_barrier, WAIT_EVENT_HASH_BUILD_ALLOCATE);
            /* Fall through. */

        case PHJ_BUILD_HASH_INNER:

            /*
             * It's time to begin hashing, or if we just arrived here then
             * hashing is already underway, so join in that effort.  While
             * hashing we have to be prepared to help increase the number of
             * batches or buckets at any time, and if we arrived here when
             * that was already underway we'll have to help complete that work
             * immediately so that it's safe to access batches and buckets
             * below.
             */
            if (PHJ_GROW_BATCHES_PHASE(BarrierAttach(&pstate->grow_batches_barrier)) !=
                PHJ_GROW_BATCHES_ELECT)
                ExecParallelHashIncreaseNumBatches(hashtable);
            if (PHJ_GROW_BUCKETS_PHASE(BarrierAttach(&pstate->grow_buckets_barrier)) !=
                PHJ_GROW_BUCKETS_ELECT)
                ExecParallelHashIncreaseNumBuckets(hashtable);
            ExecParallelHashEnsureBatchAccessors(hashtable);
            ExecParallelHashTableSetCurrentBatch(hashtable, 0);

            /* Pull tuples from the outer plan until it is exhausted. */
            for (;;)
            {
                bool        isnull;

                slot = ExecProcNode(outerNode);
                if (TupIsNull(slot))
                    break;
                econtext->ecxt_outertuple = slot;

                ResetExprContext(econtext);

                hashvalue = DatumGetUInt32(ExecEvalExprSwitchContext(node->hash_expr,
                                                                     econtext,
                                                                     &isnull));

                /*
                 * A tuple whose hash expression evaluates to NULL is not
                 * inserted, but it is still counted in partialTuples.
                 */
                if (!isnull)
                    ExecParallelHashTableInsert(hashtable, slot, hashvalue);
                hashtable->partialTuples++;
            }

            /*
             * Make sure that any tuples we wrote to disk are visible to
             * others before anyone tries to load them.
             */
            for (i = 0; i < hashtable->nbatch; ++i)
                sts_end_write(hashtable->batches[i].inner_tuples);

            /*
             * Update shared counters.  We need an accurate total tuple count
             * to control the empty table optimization.
             */
            ExecParallelHashMergeCounters(hashtable);

            /*
             * Detach from the two growth barriers attached above, in the
             * reverse of the order in which they were attached.
             */
            BarrierDetach(&pstate->grow_buckets_barrier);
            BarrierDetach(&pstate->grow_batches_barrier);

            /*
             * Wait for everyone to finish building and flushing files and
             * counters.
             */
            if (BarrierArriveAndWait(build_barrier,
                                     WAIT_EVENT_HASH_BUILD_HASH_INNER))
            {
                /*
                 * Elect one backend to disable any further growth.  Batches
                 * are now fixed.  While building them we made sure they'd fit
                 * in our memory budget when we load them back in later (or we
                 * tried to do that and gave up because we detected extreme
                 * skew).
                 */
                pstate->growth = PHJ_GROWTH_DISABLED;
            }
    }

    /*
     * We're not yet attached to a batch.  We all agree on the dimensions and
     * number of inner tuples (for the empty table optimization).
     */
    hashtable->curbatch = -1;
    hashtable->nbuckets = pstate->nbuckets;
    hashtable->log2_nbuckets = pg_ceil_log2_32(hashtable->nbuckets);
    hashtable->totalTuples = pstate->total_tuples;

    /*
     * Unless we're completely done and the batch state has been freed, make
     * sure we have accessors.
     */
    if (BarrierPhase(build_barrier) < PHJ_BUILD_FREE)
        ExecParallelHashEnsureBatchAccessors(hashtable);

    /*
     * The next synchronization point is in ExecHashJoin's HJ_BUILD_HASHTABLE
     * case, which will bring the build phase to PHJ_BUILD_RUN (if it isn't
     * there already).
     */
    Assert(BarrierPhase(build_barrier) == PHJ_BUILD_HASH_OUTER ||
           BarrierPhase(build_barrier) == PHJ_BUILD_RUN ||
           BarrierPhase(build_barrier) == PHJ_BUILD_FREE);
}

References Assert(), BarrierArriveAndWait(), BarrierAttach(), BarrierDetach(), BarrierPhase(), HashJoinTableData::batches, ParallelHashJoinState::build_barrier, HashJoinTableData::curbatch, DatumGetUInt32(), ExprContext::ecxt_outertuple, ExecEvalExprSwitchContext(), ExecParallelHashEnsureBatchAccessors(), ExecParallelHashIncreaseNumBatches(), ExecParallelHashIncreaseNumBuckets(), ExecParallelHashMergeCounters(), ExecParallelHashTableInsert(), ExecParallelHashTableSetCurrentBatch(), ExecProcNode(), ParallelHashJoinState::grow_batches_barrier, ParallelHashJoinState::grow_buckets_barrier, ParallelHashJoinState::growth, HashState::hash_expr, HashState::hashtable, i, ParallelHashJoinBatchAccessor::inner_tuples, HashJoinTableData::log2_nbuckets, HashJoinTableData::nbatch, ParallelHashJoinState::nbuckets, HashJoinTableData::nbuckets, outerPlanState, HashJoinTableData::parallel_state, HashJoinTableData::partialTuples, pg_ceil_log2_32(), PHJ_BUILD_ALLOCATE, PHJ_BUILD_FREE, PHJ_BUILD_HASH_INNER, PHJ_BUILD_HASH_OUTER, PHJ_BUILD_RUN, PHJ_GROW_BATCHES_ELECT, PHJ_GROW_BATCHES_PHASE, PHJ_GROW_BUCKETS_ELECT, PHJ_GROW_BUCKETS_PHASE, PHJ_GROWTH_DISABLED, HashState::ps, PlanState::ps_ExprContext, ResetExprContext, sts_end_write(), ParallelHashJoinState::total_tuples, HashJoinTableData::totalTuples, and TupIsNull.

Referenced by MultiExecHash().

◆ MultiExecPrivateHash()

static void MultiExecPrivateHash ( HashState * node )

static

Definition at line 137 of file nodeHash.c.

{
    /*
     * Build a backend-private hash table: drain the plan node below this
     * Hash node, hash each tuple and insert it into either the skew hash
     * table or the main hash table (or temp files), then finalize the bucket
     * count and the space accounting.
     */
    PlanState  *child = outerPlanState(node);
    HashJoinTable table = node->hashtable;
    ExprContext *ectx = node->ps.ps_ExprContext;

    for (;;)
    {
        TupleTableSlot *tup;
        Datum       hashdatum;
        bool        hashisnull;
        uint32      hashvalue;
        int         skewno;

        /* fetch the next input tuple; an empty slot marks end of input */
        tup = ExecProcNode(child);
        if (TupIsNull(tup))
            break;

        /* evaluate the hash expression against this tuple */
        ectx->ecxt_outertuple = tup;
        ResetExprContext(ectx);
        hashdatum = ExecEvalExprSwitchContext(node->hash_expr, ectx,
                                              &hashisnull);

        /* a NULL hash result means the tuple is neither stored nor counted */
        if (hashisnull)
            continue;

        hashvalue = DatumGetUInt32(hashdatum);
        skewno = ExecHashGetSkewBucket(table, hashvalue);

        if (skewno == INVALID_SKEW_BUCKET_NO)
        {
            /* Not subject to skew optimization, so insert normally */
            ExecHashTableInsert(table, tup, hashvalue);
        }
        else
        {
            /* It's a skew tuple, so put it into that hash table */
            ExecHashSkewTableInsert(table, tup, hashvalue, skewno);
            table->skewTuples += 1;
        }

        table->totalTuples += 1;
    }

    /* grow the bucket array if the optimal count has moved (NTUP_PER_BUCKET) */
    if (table->nbuckets != table->nbuckets_optimal)
        ExecHashIncreaseNumBuckets(table);

    /* Charge the bucket array to spaceUsed (reported in EXPLAIN ANALYZE) */
    table->spaceUsed += table->nbuckets * sizeof(HashJoinTuple);
    if (table->spacePeak < table->spaceUsed)
        table->spacePeak = table->spaceUsed;

    /* in the private case this process has seen every tuple */
    table->partialTuples = table->totalTuples;
}

References DatumGetUInt32(), ExprContext::ecxt_outertuple, ExecEvalExprSwitchContext(), ExecHashGetSkewBucket(), ExecHashIncreaseNumBuckets(), ExecHashSkewTableInsert(), ExecHashTableInsert(), ExecProcNode(), HashState::hash_expr, HashState::hashtable, INVALID_SKEW_BUCKET_NO, HashJoinTableData::nbuckets, HashJoinTableData::nbuckets_optimal, outerPlanState, HashJoinTableData::partialTuples, HashState::ps, PlanState::ps_ExprContext, ResetExprContext, HashJoinTableData::skewTuples, HashJoinTableData::spacePeak, HashJoinTableData::spaceUsed, HashJoinTableData::totalTuples, and TupIsNull.

Referenced by MultiExecHash().

Macros

Functions

Macro Definition Documentation

◆ NTUP_PER_BUCKET

Function Documentation

◆ dense_alloc()

◆ ExecChooseHashTableSize()

◆ ExecEndHash()

◆ ExecHash()

◆ ExecHashAccumInstrumentation()

◆ ExecHashBuildSkewHash()

◆ ExecHashEstimate()

◆ ExecHashGetBucketAndBatch()

◆ ExecHashGetSkewBucket()

◆ ExecHashIncreaseBatchSize()

◆ ExecHashIncreaseNumBatches()

◆ ExecHashIncreaseNumBuckets()

◆ ExecHashInitializeDSM()

◆ ExecHashInitializeWorker()

◆ ExecHashRemoveNextSkewBucket()

◆ ExecHashRetrieveInstrumentation()

◆ ExecHashSkewTableInsert()

◆ ExecHashTableCreate()

◆ ExecHashTableDestroy()

◆ ExecHashTableDetach()

◆ ExecHashTableDetachBatch()

◆ ExecHashTableInsert()

◆ ExecHashTableReset()

◆ ExecHashTableResetMatchFlags()

◆ ExecInitHash()

◆ ExecParallelHashCloseBatchAccessors()

◆ ExecParallelHashEnsureBatchAccessors()

◆ ExecParallelHashFirstTuple()

◆ ExecParallelHashIncreaseNumBatches()

◆ ExecParallelHashIncreaseNumBuckets()

◆ ExecParallelHashJoinSetUpBatches()

◆ ExecParallelHashMergeCounters()

◆ ExecParallelHashNextTuple()

◆ ExecParallelHashPopChunkQueue()

◆ ExecParallelHashPushTuple()

◆ ExecParallelHashRepartitionFirst()

◆ ExecParallelHashRepartitionRest()

◆ ExecParallelHashTableAlloc()

◆ ExecParallelHashTableInsert()

◆ ExecParallelHashTableInsertCurrentBatch()

◆ ExecParallelHashTableSetCurrentBatch()

◆ ExecParallelHashTupleAlloc()

◆ ExecParallelHashTuplePrealloc()

◆ ExecParallelPrepHashTableForUnmatched()

◆ ExecParallelScanHashBucket()

◆ ExecParallelScanHashTableForUnmatched()

◆ ExecPrepHashTableForUnmatched()

◆ ExecReScanHash()

◆ ExecScanHashBucket()

◆ ExecScanHashTableForUnmatched()

◆ ExecShutdownHash()

◆ get_hash_memory_limit()

◆ MultiExecHash()

◆ MultiExecParallelHash()

◆ MultiExecPrivateHash()