#include "access/parallel.h"
#include "nodes/execnodes.h"

Include dependency graph for nodeHash.h:

This graph shows which files directly or indirectly include this file:

Functions
HashState *	ExecInitHash (Hash *node, EState *estate, int eflags)

Node *	MultiExecHash (HashState *node)

void	ExecEndHash (HashState *node)

void	ExecReScanHash (HashState *node)

HashJoinTable	ExecHashTableCreate (HashState *state)

void	ExecParallelHashTableAlloc (HashJoinTable hashtable, int batchno)

void	ExecHashTableDestroy (HashJoinTable hashtable)

void	ExecHashTableDetach (HashJoinTable hashtable)

void	ExecHashTableDetachBatch (HashJoinTable hashtable)

void	ExecParallelHashTableSetCurrentBatch (HashJoinTable hashtable, int batchno)

void	ExecHashTableInsert (HashJoinTable hashtable, TupleTableSlot *slot, uint32 hashvalue)

void	ExecParallelHashTableInsert (HashJoinTable hashtable, TupleTableSlot *slot, uint32 hashvalue)

void	ExecParallelHashTableInsertCurrentBatch (HashJoinTable hashtable, TupleTableSlot *slot, uint32 hashvalue)

void	ExecHashGetBucketAndBatch (HashJoinTable hashtable, uint32 hashvalue, int *bucketno, int *batchno)

bool	ExecScanHashBucket (HashJoinState *hjstate, ExprContext *econtext)

bool	ExecParallelScanHashBucket (HashJoinState *hjstate, ExprContext *econtext)

void	ExecPrepHashTableForUnmatched (HashJoinState *hjstate)

bool	ExecParallelPrepHashTableForUnmatched (HashJoinState *hjstate)

bool	ExecScanHashTableForUnmatched (HashJoinState *hjstate, ExprContext *econtext)

bool	ExecParallelScanHashTableForUnmatched (HashJoinState *hjstate, ExprContext *econtext)

void	ExecHashTableReset (HashJoinTable hashtable)

void	ExecHashTableResetMatchFlags (HashJoinTable hashtable)

void	ExecChooseHashTableSize (double ntuples, int tupwidth, bool useskew, bool try_combined_hash_mem, int parallel_workers, size_t *space_allowed, int *numbuckets, int *numbatches, int *num_skew_mcvs)

int	ExecHashGetSkewBucket (HashJoinTable hashtable, uint32 hashvalue)

void	ExecHashEstimate (HashState *node, ParallelContext *pcxt)

void	ExecHashInitializeDSM (HashState *node, ParallelContext *pcxt)

void	ExecHashInitializeWorker (HashState *node, ParallelWorkerContext *pwcxt)

void	ExecHashRetrieveInstrumentation (HashState *node)

void	ExecShutdownHash (HashState *node)

void	ExecHashAccumInstrumentation (HashInstrumentation *instrument, HashJoinTable hashtable)

Function Documentation

◆ ExecChooseHashTableSize()

void ExecChooseHashTableSize	(	double	ntuples,
		int	tupwidth,
		bool	useskew,
		bool	try_combined_hash_mem,
		int	parallel_workers,
		size_t *	space_allowed,
		int *	numbuckets,
		int *	numbatches,
		int *	num_skew_mcvs
	)

Definition at line 657 of file nodeHash.c.

{
    /*
     * Choose the initial hash table geometry for a hash join.
     *
     * Inputs: ntuples and tupwidth describe the estimated inner relation;
     * useskew says whether to budget space for a skew hash table;
     * try_combined_hash_mem and parallel_workers control whether the memory
     * limit is first scaled by (parallel_workers + 1).
     *
     * Outputs (all through pointers): *space_allowed, *numbuckets,
     * *numbatches and *num_skew_mcvs.  Note that *space_allowed and
     * *num_skew_mcvs are stored early and may be doubled again by the
     * batch-reduction loop at the bottom, whereas *numbuckets and
     * *numbatches are only stored on exit.
     */
    int         tupsize;
    double      inner_rel_bytes;
    size_t      hash_table_bytes;
    size_t      bucket_bytes;
    size_t      max_pointers;
    int         nbatch = 1;
    int         nbuckets;
    double      dbuckets;
 
    /* Force a plausible relation size if no info */
    if (ntuples <= 0.0)
        ntuples = 1000.0;
 
    /*
     * Estimate tupsize based on footprint of tuple in hashtable... note this
     * does not allow for any palloc overhead.  The manipulations of spaceUsed
     * don't count palloc overhead either.
     */
    tupsize = HJTUPLE_OVERHEAD +
        MAXALIGN(SizeofMinimalTupleHeader) +
        MAXALIGN(tupwidth);
    inner_rel_bytes = ntuples * tupsize;
 
    /*
     * Compute in-memory hashtable size limit from GUCs.
     */
    hash_table_bytes = get_hash_memory_limit();
 
    /*
     * Parallel Hash tries to use the combined hash_mem of all workers to
     * avoid the need to batch.  If that won't work, it falls back to hash_mem
     * per worker and tries to process batches in parallel.
     */
    if (try_combined_hash_mem)
    {
        /* Careful, this could overflow size_t */
        double      newlimit;
 
        newlimit = (double) hash_table_bytes * (double) (parallel_workers + 1);
        newlimit = Min(newlimit, (double) SIZE_MAX);
        hash_table_bytes = (size_t) newlimit;
    }
 
    *space_allowed = hash_table_bytes;
 
    /*
     * If skew optimization is possible, estimate the number of skew buckets
     * that will fit in the memory allowed, and decrement the assumed space
     * available for the main hash table accordingly.
     *
     * We make the optimistic assumption that each skew bucket will contain
     * one inner-relation tuple.  If that turns out to be low, we will recover
     * at runtime by reducing the number of skew buckets.
     *
     * hashtable->skewBucket will have up to 8 times as many HashSkewBucket
     * pointers as the number of MCVs we allow, since ExecHashBuildSkewHash
     * will round up to the next power of 2 and then multiply by 4 to reduce
     * collisions.
     */
    if (useskew)
    {
        size_t      bytes_per_mcv;
        size_t      skew_mcvs;
 
        /*----------
         * Compute number of MCVs we could hold in hash_table_bytes
         *
         * Divisor is:
         * size of a hash tuple +
         * worst-case size of skewBucket[] per MCV +
         * size of skewBucketNums[] entry +
         * size of skew bucket struct itself
         *----------
         */
        bytes_per_mcv = tupsize +
            (8 * sizeof(HashSkewBucket *)) +
            sizeof(int) +
            SKEW_BUCKET_OVERHEAD;
        skew_mcvs = hash_table_bytes / bytes_per_mcv;
 
        /*
         * Now scale by SKEW_HASH_MEM_PERCENT (we do it in this order so as
         * not to worry about size_t overflow in the multiplication)
         */
        skew_mcvs = (skew_mcvs * SKEW_HASH_MEM_PERCENT) / 100;
 
        /* Now clamp to integer range */
        skew_mcvs = Min(skew_mcvs, INT_MAX);
 
        *num_skew_mcvs = (int) skew_mcvs;
 
        /* Reduce hash_table_bytes by the amount needed for the skew table */
        if (skew_mcvs > 0)
            hash_table_bytes -= skew_mcvs * bytes_per_mcv;
    }
    else
        *num_skew_mcvs = 0;
 
    /*
     * Set nbuckets to achieve an average bucket load of NTUP_PER_BUCKET when
     * memory is filled, assuming a single batch; but limit the value so that
     * the pointer arrays we'll try to allocate do not exceed hash_table_bytes
     * nor MaxAllocSize.
     *
     * Note that both nbuckets and nbatch must be powers of 2 to make
     * ExecHashGetBucketAndBatch fast.
     */
    max_pointers = hash_table_bytes / sizeof(HashJoinTuple);
    max_pointers = Min(max_pointers, MaxAllocSize / sizeof(HashJoinTuple));
    /* If max_pointers isn't a power of 2, must round it down to one */
    max_pointers = pg_prevpower2_size_t(max_pointers);
 
    /* Also ensure we avoid integer overflow in nbatch and nbuckets */
    /* (this step is redundant given the current value of MaxAllocSize) */
    max_pointers = Min(max_pointers, INT_MAX / 2 + 1);
 
    dbuckets = ceil(ntuples / NTUP_PER_BUCKET);
    dbuckets = Min(dbuckets, max_pointers);
    nbuckets = (int) dbuckets;
    /* don't let nbuckets be really small, though ... */
    nbuckets = Max(nbuckets, 1024);
    /* ... and force it to be a power of 2. */
    nbuckets = pg_nextpower2_32(nbuckets);
 
    /*
     * If there's not enough space to store the projected number of tuples and
     * the required bucket headers, we will need multiple batches.
     *
     * inner_rel_bytes is a double, so the sum below is evaluated in floating
     * point rather than in size_t arithmetic.
     */
    bucket_bytes = sizeof(HashJoinTuple) * nbuckets;
    if (inner_rel_bytes + bucket_bytes > hash_table_bytes)
    {
        /* We'll need multiple batches */
        size_t      sbuckets;
        double      dbatch;
        int         minbatch;
        size_t      bucket_size;
 
        /*
         * If Parallel Hash with combined hash_mem would still need multiple
         * batches, we'll have to fall back to regular hash_mem budget.
         *
         * The recursive call passes try_combined_hash_mem = false, so it
         * cannot recurse again, and it stores all four output parameters
         * itself; hence we simply return afterwards.
         */
        if (try_combined_hash_mem)
        {
            ExecChooseHashTableSize(ntuples, tupwidth, useskew,
                                    false, parallel_workers,
                                    space_allowed,
                                    numbuckets,
                                    numbatches,
                                    num_skew_mcvs);
            return;
        }
 
        /*
         * Estimate the number of buckets we'll want to have when hash_mem is
         * entirely full.  Each bucket will contain a bucket pointer plus
         * NTUP_PER_BUCKET tuples, whose projected size already includes
         * overhead for the hash code, pointer to the next tuple, etc.
         */
        bucket_size = (tupsize * NTUP_PER_BUCKET + sizeof(HashJoinTuple));
        if (hash_table_bytes <= bucket_size)
            sbuckets = 1;       /* avoid pg_nextpower2_size_t(0) */
        else
            sbuckets = pg_nextpower2_size_t(hash_table_bytes / bucket_size);
        sbuckets = Min(sbuckets, max_pointers);
        nbuckets = (int) sbuckets;
        nbuckets = pg_nextpower2_32(nbuckets);
        bucket_bytes = nbuckets * sizeof(HashJoinTuple);
 
        /*
         * Buckets are simple pointers to hashjoin tuples, while tupsize
         * includes the pointer, hash code, and MinimalTupleData.  So buckets
         * should never really exceed 25% of hash_mem (even for
         * NTUP_PER_BUCKET=1); except maybe for hash_mem values that are not
         * 2^N bytes, where we might get more because of doubling. So let's
         * look for 50% here.
         */
        Assert(bucket_bytes <= hash_table_bytes / 2);
 
        /* Calculate required number of batches. */
        dbatch = ceil(inner_rel_bytes / (hash_table_bytes - bucket_bytes));
        dbatch = Min(dbatch, max_pointers);
        minbatch = (int) dbatch;
        nbatch = pg_nextpower2_32(Max(2, minbatch));
    }
 
    /*
     * Optimize the total amount of memory consumed by the hash node.
     *
     * The nbatch calculation above focuses on the in-memory hash table,
     * assuming no per-batch overhead. But each batch may have two files, each
     * with a BLCKSZ buffer. For large nbatch values these buffers may use
     * significantly more memory than the hash table.
     *
     * The total memory usage may be expressed by this formula:
     *
     * (inner_rel_bytes / nbatch) + (2 * nbatch * BLCKSZ)
     *
     * where (inner_rel_bytes / nbatch) is the size of the in-memory hash
     * table and (2 * nbatch * BLCKSZ) is the amount of memory used by file
     * buffers.
     *
     * The nbatch calculation however ignores the second part. And for very
     * large inner_rel_bytes, there may be no nbatch that keeps total memory
     * usage under the budget (work_mem * hash_mem_multiplier). To deal with
     * that, we will adjust nbatch to minimize total memory consumption across
     * both the hashtable and file buffers.
     *
     * As we increase the size of the hashtable, the number of batches
     * decreases, and the total memory usage follows a U-shaped curve. We find
     * the minimum nbatch by "walking back" -- checking if halving nbatch
     * would lower the total memory usage. We stop when it no longer helps.
     *
     * We only reduce the number of batches. Adding batches reduces memory
     * usage only when most of the memory is used by the hash table, with
     * total memory usage within the limit or not far from it. We don't want
     * to start batching when not needed, even if that would reduce memory
     * usage.
     *
     * While growing the hashtable, we also adjust the number of buckets to
     * maintain a load factor of NTUP_PER_BUCKET while squeezing tuples back
     * from batches into the hashtable.
     *
     * Note that we can only change nbuckets during initial hashtable sizing.
     * Once we start building the hash, nbuckets is fixed (we may still grow
     * the hash table).
     *
     * We double several parameters (space_allowed, nbuckets, num_skew_mcvs),
     * which introduces a risk of overflow. We avoid this by exiting the loop.
     * We could do something smarter (e.g. capping nbuckets and continue), but
     * the complexity is not worth it. Such cases are extremely rare, and this
     * is a best-effort attempt to reduce memory usage.
     */
    while (nbatch > 1)
    {
        /* Check that buckets won't overflow MaxAllocSize */
        if (nbuckets > (MaxAllocSize / sizeof(HashJoinTuple) / 2))
            break;
 
        /* num_skew_mcvs should be less than nbuckets */
        Assert((*num_skew_mcvs) < (INT_MAX / 2));
 
        /*
         * Check that space_allowed won't overflow SIZE_MAX.
         *
         * We don't use hash_table_bytes here, because it does not include the
         * skew buckets. And we want to limit the overall memory limit.
         */
        if ((*space_allowed) > (SIZE_MAX / 2))
            break;
 
        /*
         * Will halving the number of batches and doubling the size of the
         * hashtable reduce overall memory usage?
         *
         * This is the same as (S = space_allowed):
         *
         * (S + 2 * nbatch * BLCKSZ) < (S * 2 + nbatch * BLCKSZ)
         *
         * but avoiding intermediate overflow.
         *
         * Rearranged, that inequality is nbatch * BLCKSZ < S.  When it holds,
         * the current layout already uses less memory than the layout with
         * half the batches would, so halving no longer helps and we stop.
         */
        if (nbatch < (*space_allowed) / BLCKSZ)
            break;
 
        /*
         * MaxAllocSize is sufficiently small that we are not worried about
         * overflowing nbuckets.
         */
        nbuckets *= 2;
 
        *num_skew_mcvs = (*num_skew_mcvs) * 2;
        *space_allowed = (*space_allowed) * 2;
 
        nbatch /= 2;
    }
 
    Assert(nbuckets > 0);
    Assert(nbatch > 0);
 
    /* Publish the final bucket and batch counts to the caller. */
    *numbuckets = nbuckets;
    *numbatches = nbatch;
}

References Assert(), ExecChooseHashTableSize(), get_hash_memory_limit(), HJTUPLE_OVERHEAD, Max, MAXALIGN, MaxAllocSize, Min, NTUP_PER_BUCKET, pg_nextpower2_32(), pg_nextpower2_size_t, pg_prevpower2_size_t, SizeofMinimalTupleHeader, SKEW_BUCKET_OVERHEAD, and SKEW_HASH_MEM_PERCENT.

Referenced by ExecChooseHashTableSize(), ExecHashTableCreate(), and initial_cost_hashjoin().

◆ ExecEndHash()

void ExecEndHash ( HashState * node )

Definition at line 426 of file nodeHash.c.

{
    /*
     * The only work done here is shutting down the subplan, i.e. the node
     * returned by outerPlanState().
     */
    ExecEndNode(outerPlanState(node));
}

References ExecEndNode(), outerPlan, and outerPlanState.

Referenced by ExecEndNode().

◆ ExecHashAccumInstrumentation()

void ExecHashAccumInstrumentation	(	HashInstrumentation *	instrument,
		HashJoinTable	hashtable
	)

Definition at line 2876 of file nodeHash.c.

{
    /*
     * Fold the hash table's current statistics into *instrument.  Every
     * counter keeps whichever is larger: the value already recorded or the
     * value found in the hash table.
     */
    if (instrument->nbuckets < hashtable->nbuckets)
        instrument->nbuckets = hashtable->nbuckets;

    if (instrument->nbuckets_original < hashtable->nbuckets_original)
        instrument->nbuckets_original = hashtable->nbuckets_original;

    if (instrument->nbatch < hashtable->nbatch)
        instrument->nbatch = hashtable->nbatch;

    if (instrument->nbatch_original < hashtable->nbatch_original)
        instrument->nbatch_original = hashtable->nbatch_original;

    if (instrument->space_peak < hashtable->spacePeak)
        instrument->space_peak = hashtable->spacePeak;
}

References Max, HashJoinTableData::nbatch, HashInstrumentation::nbatch, HashJoinTableData::nbatch_original, HashInstrumentation::nbatch_original, HashJoinTableData::nbuckets, HashInstrumentation::nbuckets, HashJoinTableData::nbuckets_original, HashInstrumentation::nbuckets_original, HashInstrumentation::space_peak, and HashJoinTableData::spacePeak.

Referenced by ExecReScanHashJoin(), and ExecShutdownHash().

◆ ExecHashEstimate()

void ExecHashEstimate	(	HashState *	node,
		ParallelContext *	pcxt
	)

Definition at line 2760 of file nodeHash.c.

{
    /*
     * Space is reserved only when the node is being instrumented and at
     * least one worker exists; otherwise there is nothing to do.
     */
    if (node->ps.instrument && pcxt->nworkers != 0)
    {
        size_t      nbytes;

        /* one HashInstrumentation per worker, after the SharedHashInfo header */
        nbytes = add_size(mul_size(pcxt->nworkers, sizeof(HashInstrumentation)),
                          offsetof(SharedHashInfo, hinstrument));

        shm_toc_estimate_chunk(&pcxt->estimator, nbytes);
        shm_toc_estimate_keys(&pcxt->estimator, 1);
    }
}

References add_size(), ParallelContext::estimator, PlanState::instrument, mul_size(), ParallelContext::nworkers, HashState::ps, shm_toc_estimate_chunk, and shm_toc_estimate_keys.

Referenced by ExecParallelEstimate().

◆ ExecHashGetBucketAndBatch()

void ExecHashGetBucketAndBatch	(	HashJoinTable	hashtable,
		uint32	hashvalue,
		int *	bucketno,
		int *	batchno
	)

Definition at line 1959 of file nodeHash.c.

{
    const uint32 bucket_count = (uint32) hashtable->nbuckets;
    const uint32 batch_count = (uint32) hashtable->nbatch;

    /* The bucket number is always the low-order bits of the hash value. */
    *bucketno = hashvalue & (bucket_count - 1);

    if (batch_count <= 1)
    {
        /* With a single batch everything belongs to batch 0. */
        *batchno = 0;
    }
    else
    {
        /*
         * Rotate the hash value right by log2_nbuckets bits and mask the
         * result to get the batch number.
         */
        *batchno = pg_rotate_right32(hashvalue,
                                     hashtable->log2_nbuckets) & (batch_count - 1);
    }
}

References HashJoinTableData::log2_nbuckets, HashJoinTableData::nbatch, HashJoinTableData::nbuckets, and pg_rotate_right32().

Referenced by ExecHashIncreaseNumBatches(), ExecHashIncreaseNumBuckets(), ExecHashJoinImpl(), ExecHashRemoveNextSkewBucket(), ExecHashTableInsert(), ExecParallelHashIncreaseNumBuckets(), ExecParallelHashJoinPartitionOuter(), ExecParallelHashRepartitionFirst(), ExecParallelHashRepartitionRest(), ExecParallelHashTableInsert(), and ExecParallelHashTableInsertCurrentBatch().

◆ ExecHashGetSkewBucket()

int ExecHashGetSkewBucket	(	HashJoinTable	hashtable,
		uint32	hashvalue
	)

Definition at line 2554 of file nodeHash.c.

{
    int         mask;
    int         slot;

    /*
     * When skew optimization is not active there can be no match; per the
     * original note this is, in particular, the case once the initial batch
     * is done.
     */
    if (!hashtable->skewEnabled)
        return INVALID_SKEW_BUCKET_NO;

    /* skewBucketLen is a power of 2, so ANDing with (len - 1) is a modulo. */
    mask = hashtable->skewBucketLen - 1;
    slot = hashvalue & mask;

    /*
     * Probe successive slots, wrapping around via the mask.  An empty slot
     * means there is no entry for this hash value; a slot holding the same
     * hash value is the bucket we want; anything else is a collision, so
     * move on to the next slot.
     */
    for (;;)
    {
        HashSkewBucket *entry = hashtable->skewBucket[slot];

        if (entry == NULL)
            return INVALID_SKEW_BUCKET_NO;

        if (entry->hashvalue == hashvalue)
            return slot;

        slot = (slot + 1) & mask;
    }
}

References HashSkewBucket::hashvalue, INVALID_SKEW_BUCKET_NO, HashJoinTableData::skewBucket, HashJoinTableData::skewBucketLen, and HashJoinTableData::skewEnabled.

Referenced by ExecHashJoinImpl(), and MultiExecPrivateHash().

◆ ExecHashInitializeDSM()

void ExecHashInitializeDSM	(	HashState *	node,
		ParallelContext *	pcxt
	)

Definition at line 2779 of file nodeHash.c.

{
    SharedHashInfo *shinfo;
    size_t      nbytes;

    /* Nothing to set up unless we are instrumenting and have workers. */
    if (!node->ps.instrument || pcxt->nworkers == 0)
        return;

    /* Header plus one HashInstrumentation slot per worker. */
    nbytes = offsetof(SharedHashInfo, hinstrument) +
        pcxt->nworkers * sizeof(HashInstrumentation);

    shinfo = (SharedHashInfo *) shm_toc_allocate(pcxt->toc, nbytes);
    node->shared_info = shinfo;

    /* All per-worker slots have to begin zeroed. */
    memset(shinfo, 0, nbytes);
    shinfo->num_workers = pcxt->nworkers;

    /* Register the area in the TOC under this plan node's id. */
    shm_toc_insert(pcxt->toc, node->ps.plan->plan_node_id, shinfo);
}

References PlanState::instrument, SharedHashInfo::num_workers, ParallelContext::nworkers, PlanState::plan, Plan::plan_node_id, HashState::ps, HashState::shared_info, shm_toc_allocate(), shm_toc_insert(), and ParallelContext::toc.

Referenced by ExecParallelInitializeDSM().

◆ ExecHashInitializeWorker()

void ExecHashInitializeWorker	(	HashState *	node,
		ParallelWorkerContext *	pwcxt
	)

Definition at line 2804 of file nodeHash.c.

{
    /* Only instrumented nodes need a per-worker stats slot. */
    if (node->ps.instrument)
    {
        /*
         * Look up the shared area by this plan node's id and remember the
         * slot indexed by ParallelWorkerNumber, so that stats are accumulated
         * there when shutting down or rebuilding the hash table.
         */
        SharedHashInfo *shinfo = (SharedHashInfo *)
            shm_toc_lookup(pwcxt->toc, node->ps.plan->plan_node_id, false);

        node->hinstrument = shinfo->hinstrument + ParallelWorkerNumber;
    }
}

References SharedHashInfo::hinstrument, HashState::hinstrument, PlanState::instrument, ParallelWorkerNumber, PlanState::plan, Plan::plan_node_id, HashState::ps, shm_toc_lookup(), and ParallelWorkerContext::toc.

Referenced by ExecParallelInitializeWorker().

◆ ExecHashRetrieveInstrumentation()

void ExecHashRetrieveInstrumentation ( HashState * node )

Definition at line 2845 of file nodeHash.c.

{
    SharedHashInfo *shared_area = node->shared_info;

    if (shared_area != NULL)
    {
        SharedHashInfo *local_copy;
        size_t      nbytes;

        /* Header plus one HashInstrumentation per recorded worker. */
        nbytes = offsetof(SharedHashInfo, hinstrument) +
            shared_area->num_workers * sizeof(HashInstrumentation);

        /* Swap node->shared_info for a palloc'd copy of the same bytes. */
        local_copy = palloc(nbytes);
        memcpy(local_copy, shared_area, nbytes);
        node->shared_info = local_copy;
    }
}

References SharedHashInfo::num_workers, palloc(), and HashState::shared_info.

Referenced by ExecParallelRetrieveInstrumentation().

◆ ExecHashTableCreate()

HashJoinTable ExecHashTableCreate ( HashState * state )

Definition at line 445 of file nodeHash.c.

{
    /*
     * Build and return a new hash join table for this Hash node: size it
     * with ExecChooseHashTableSize(), fill in the control block, create the
     * hashCxt/batchCxt/spillCxt memory contexts, and then either take part
     * in the parallel build barrier protocol or allocate the private bucket
     * array (plus the skew hash table when more than one batch is needed).
     */
    Hash       *node;
    HashJoinTable hashtable;
    Plan       *outerNode;
    size_t      space_allowed;
    int         nbuckets;
    int         nbatch;
    double      rows;
    int         num_skew_mcvs;
    int         log2_nbuckets;
    MemoryContext oldcxt;
 
    /*
     * Get information about the size of the relation to be hashed (it's the
     * "outer" subtree of this node, but the inner relation of the hashjoin).
     * Compute the appropriate size of the hash table.
     */
    node = (Hash *) state->ps.plan;
    outerNode = outerPlan(node);
 
    /*
     * If this is shared hash table with a partial plan, then we can't use
     * outerNode->plan_rows to estimate its size.  We need an estimate of the
     * total number of rows across all copies of the partial plan.
     */
    rows = node->plan.parallel_aware ? node->rows_total : outerNode->plan_rows;
 
    /*
     * For a parallel hash we first try the combined memory budget, passing
     * (nparticipants - 1) as the worker count; otherwise zero workers.
     */
    ExecChooseHashTableSize(rows, outerNode->plan_width,
                            OidIsValid(node->skewTable),
                            state->parallel_state != NULL,
                            state->parallel_state != NULL ?
                            state->parallel_state->nparticipants - 1 : 0,
                            &space_allowed,
                            &nbuckets, &nbatch, &num_skew_mcvs);
 
    /* nbuckets must be a power of 2 */
    log2_nbuckets = pg_ceil_log2_32(nbuckets);
    Assert(nbuckets == (1 << log2_nbuckets));
 
    /*
     * Initialize the hash table control block.
     *
     * The hashtable control block is just palloc'd from the executor's
     * per-query memory context.  Everything else should be kept inside the
     * subsidiary hashCxt, batchCxt or spillCxt.
     */
    hashtable = palloc_object(HashJoinTableData);
    hashtable->nbuckets = nbuckets;
    hashtable->nbuckets_original = nbuckets;
    hashtable->nbuckets_optimal = nbuckets;
    hashtable->log2_nbuckets = log2_nbuckets;
    hashtable->log2_nbuckets_optimal = log2_nbuckets;
    hashtable->buckets.unshared = NULL;
    hashtable->skewEnabled = false;
    hashtable->skewBucket = NULL;
    hashtable->skewBucketLen = 0;
    hashtable->nSkewBuckets = 0;
    hashtable->skewBucketNums = NULL;
    hashtable->nbatch = nbatch;
    hashtable->curbatch = 0;
    hashtable->nbatch_original = nbatch;
    hashtable->nbatch_outstart = nbatch;
    hashtable->growEnabled = true;
    hashtable->totalTuples = 0;
    hashtable->partialTuples = 0;
    hashtable->skewTuples = 0;
    hashtable->innerBatchFile = NULL;
    hashtable->outerBatchFile = NULL;
    hashtable->spaceUsed = 0;
    hashtable->spacePeak = 0;
    hashtable->spaceAllowed = space_allowed;
    hashtable->spaceUsedSkew = 0;
    hashtable->spaceAllowedSkew =
        hashtable->spaceAllowed * SKEW_HASH_MEM_PERCENT / 100;
    hashtable->chunks = NULL;
    hashtable->current_chunk = NULL;
    hashtable->parallel_state = state->parallel_state;
    hashtable->area = state->ps.state->es_query_dsa;
    hashtable->batches = NULL;
 
#ifdef HJDEBUG
    printf("Hashjoin %p: initial nbatch = %d, nbuckets = %d\n",
           hashtable, nbatch, nbuckets);
#endif
 
    /*
     * Create temporary memory contexts in which to keep the hashtable working
     * storage.  See notes in executor/hashjoin.h.
     *
     * batchCxt and spillCxt are both created as children of hashCxt.
     */
    hashtable->hashCxt = AllocSetContextCreate(CurrentMemoryContext,
                                               "HashTableContext",
                                               ALLOCSET_DEFAULT_SIZES);
 
    hashtable->batchCxt = AllocSetContextCreate(hashtable->hashCxt,
                                                "HashBatchContext",
                                                ALLOCSET_DEFAULT_SIZES);
 
    hashtable->spillCxt = AllocSetContextCreate(hashtable->hashCxt,
                                                "HashSpillContext",
                                                ALLOCSET_DEFAULT_SIZES);
 
    /* Allocate data that will live for the life of the hashjoin */
 
    oldcxt = MemoryContextSwitchTo(hashtable->hashCxt);
 
    if (nbatch > 1 && hashtable->parallel_state == NULL)
    {
        MemoryContext oldctx;
 
        /*
         * allocate and initialize the file arrays in spillCxt (not needed for
         * parallel case which uses shared tuplestores instead of raw files)
         */
        oldctx = MemoryContextSwitchTo(hashtable->spillCxt);
 
        hashtable->innerBatchFile = palloc0_array(BufFile *, nbatch);
        hashtable->outerBatchFile = palloc0_array(BufFile *, nbatch);
 
        MemoryContextSwitchTo(oldctx);
 
        /* The files will not be opened until needed... */
        /* ... but make sure we have temp tablespaces established for them */
        PrepareTempTablespaces();
    }
 
    MemoryContextSwitchTo(oldcxt);
 
    if (hashtable->parallel_state)
    {
        ParallelHashJoinState *pstate = hashtable->parallel_state;
        Barrier    *build_barrier;
 
        /*
         * Attach to the build barrier.  The corresponding detach operation is
         * in ExecHashTableDetach.  Note that we won't attach to the
         * batch_barrier for batch 0 yet.  We'll attach later and start it out
         * in PHJ_BATCH_PROBE phase, because batch 0 is allocated up front and
         * then loaded while hashing (the standard hybrid hash join
         * algorithm), and we'll coordinate that using build_barrier.
         */
        build_barrier = &pstate->build_barrier;
        BarrierAttach(build_barrier);
 
        /*
         * So far we have no idea whether there are any other participants,
         * and if so, what phase they are working on.  The only thing we care
         * about at this point is whether someone has already created the
         * SharedHashJoinBatch objects and the hash table for batch 0.  One
         * backend will be elected to do that now if necessary.
         */
        if (BarrierPhase(build_barrier) == PHJ_BUILD_ELECT &&
            BarrierArriveAndWait(build_barrier, WAIT_EVENT_HASH_BUILD_ELECT))
        {
            /* Only the elected backend publishes the sizing into shared state. */
            pstate->nbatch = nbatch;
            pstate->space_allowed = space_allowed;
            pstate->growth = PHJ_GROWTH_OK;
 
            /* Set up the shared state for coordinating batches. */
            ExecParallelHashJoinSetUpBatches(hashtable, nbatch);
 
            /*
             * Allocate batch 0's hash table up front so we can load it
             * directly while hashing.
             */
            pstate->nbuckets = nbuckets;
            ExecParallelHashTableAlloc(hashtable, 0);
        }
 
        /*
         * The next Parallel Hash synchronization point is in
         * MultiExecParallelHash(), which will progress it all the way to
         * PHJ_BUILD_RUN.  The caller must not return control from this
         * executor node between now and then.
         */
    }
    else
    {
        /*
         * Prepare context for the first-scan space allocations; allocate the
         * hashbucket array therein, and set each bucket "empty".
         *
         * The caller's context was already restored into effect above, so the
         * return value of this switch is not saved; we go back to oldcxt at
         * the end of this branch.
         */
        MemoryContextSwitchTo(hashtable->batchCxt);
 
        hashtable->buckets.unshared = palloc0_array(HashJoinTuple, nbuckets);
 
        /*
         * Set up for skew optimization, if possible and there's a need for
         * more than one batch.  (In a one-batch join, there's no point in
         * it.)
         */
        if (nbatch > 1)
            ExecHashBuildSkewHash(state, hashtable, node, num_skew_mcvs);
 
        MemoryContextSwitchTo(oldcxt);
    }
 
    return hashtable;
}

Referenced by ExecHashJoinImpl().

◆ ExecHashTableDestroy()

void ExecHashTableDestroy ( HashJoinTable hashtable )

Definition at line 955 of file nodeHash.c.

{
    /*
     * Close every batch temp file that is still open.  Index 0 is never
     * visited, since batch 0 can't have temp files; the arrays themselves
     * may be absent (e.g. when nbatch is only 1), hence the NULL test.
     * Parallel hash joins don't use these files.
     */
    if (hashtable->innerBatchFile != NULL)
    {
        int         batchno = 1;

        while (batchno < hashtable->nbatch)
        {
            if (hashtable->innerBatchFile[batchno])
                BufFileClose(hashtable->innerBatchFile[batchno]);
            if (hashtable->outerBatchFile[batchno])
                BufFileClose(hashtable->outerBatchFile[batchno]);
            batchno++;
        }
    }

    /* Deleting hashCxt releases the working memory, child contexts included. */
    MemoryContextDelete(hashtable->hashCxt);

    /* Finally free the control block itself. */
    pfree(hashtable);
}

References BufFileClose(), HashJoinTableData::hashCxt, i, HashJoinTableData::innerBatchFile, MemoryContextDelete(), HashJoinTableData::nbatch, HashJoinTableData::outerBatchFile, and pfree().

Referenced by ExecEndHashJoin(), and ExecReScanHashJoin().

◆ ExecHashTableDetach()

void ExecHashTableDetach ( HashJoinTable hashtable )

Definition at line 3400 of file nodeHash.c.

{
    ParallelHashJoinState *pstate = hashtable->parallel_state;

    /*
     * A participant in a parallel query gets here either after the build
     * barrier reached PHJ_BUILD_RUN, or after joining so late that the
     * barrier is already in PHJ_BUILD_FREE.
     */
    Assert(!pstate ||
           BarrierPhase(&pstate->build_barrier) >= PHJ_BUILD_RUN);

    if (pstate != NULL &&
        BarrierPhase(&pstate->build_barrier) == PHJ_BUILD_RUN)
    {
        /* Make sure no temporary files are left open for any batch. */
        if (hashtable->batches != NULL)
        {
            int         batchno = 0;

            while (batchno < hashtable->nbatch)
            {
                ParallelHashJoinBatchAccessor *accessor =
                    &hashtable->batches[batchno];

                sts_end_write(accessor->inner_tuples);
                sts_end_write(accessor->outer_tuples);
                sts_end_parallel_scan(accessor->inner_tuples);
                sts_end_parallel_scan(accessor->outer_tuples);
                batchno++;
            }
        }

        /* The last backend to detach frees the shared batch array. */
        if (BarrierArriveAndDetach(&pstate->build_barrier))
        {
            /*
             * Any process that joins from now on sees this phase and gives
             * up immediately.
             */
            Assert(BarrierPhase(&pstate->build_barrier) == PHJ_BUILD_FREE);

            if (DsaPointerIsValid(pstate->batches))
            {
                dsa_free(hashtable->area, pstate->batches);
                pstate->batches = InvalidDsaPointer;
            }
        }
    }

    /* Whatever happened above, this table no longer refers to shared state. */
    hashtable->parallel_state = NULL;
}

References HashJoinTableData::area, Assert(), BarrierArriveAndDetach(), BarrierPhase(), ParallelHashJoinState::batches, HashJoinTableData::batches, ParallelHashJoinState::build_barrier, dsa_free(), DsaPointerIsValid, i, ParallelHashJoinBatchAccessor::inner_tuples, InvalidDsaPointer, HashJoinTableData::nbatch, ParallelHashJoinBatchAccessor::outer_tuples, HashJoinTableData::parallel_state, PHJ_BUILD_FREE, PHJ_BUILD_RUN, sts_end_parallel_scan(), and sts_end_write().

Referenced by ExecHashJoinReInitializeDSM(), and ExecShutdownHashJoin().

◆ ExecHashTableDetachBatch()

void ExecHashTableDetachBatch ( HashJoinTable hashtable )

Definition at line 3308 of file nodeHash.c.

{
    /*
     * Detach this process from the batch it is currently attached to.
     *
     * Does nothing unless the hash table has parallel state and curbatch is
     * non-negative.  Otherwise: ends this process's scans of the batch's
     * shared tuplestores, detaches from the batch barrier, frees the batch's
     * shared chunks and bucket array if this process is the one for which the
     * final BarrierArriveAndDetach() returns true, folds the batch size into
     * spacePeak, and finally sets curbatch to -1.
     */
    if (hashtable->parallel_state != NULL &&
        hashtable->curbatch >= 0)
    {
        int         curbatch = hashtable->curbatch;
        ParallelHashJoinBatch *batch = hashtable->batches[curbatch].shared;
        /* Only overwritten below if we detach while still in PHJ_BATCH_PROBE. */
        bool        attached = true;
 
        /* Make sure any temporary files are closed. */
        sts_end_parallel_scan(hashtable->batches[curbatch].inner_tuples);
        sts_end_parallel_scan(hashtable->batches[curbatch].outer_tuples);
 
        /* After attaching we always get at least to PHJ_BATCH_PROBE. */
        Assert(BarrierPhase(&batch->batch_barrier) == PHJ_BATCH_PROBE ||
               BarrierPhase(&batch->batch_barrier) == PHJ_BATCH_SCAN);
 
        /*
         * If we're abandoning the PHJ_BATCH_PROBE phase early without having
         * reached the end of it, it means the plan doesn't want any more
         * tuples, and it is happy to abandon any tuples buffered in this
         * process's subplans.  For correctness, we can't allow any process to
         * execute the PHJ_BATCH_SCAN phase, because we will never have the
         * complete set of match bits.  Therefore we skip emitting unmatched
         * tuples in all backends (if this is a full/right join), as if those
         * tuples were all due to be emitted by this process and it has
         * abandoned them too.
         */
        if (BarrierPhase(&batch->batch_barrier) == PHJ_BATCH_PROBE &&
            !hashtable->batches[curbatch].outer_eof)
        {
            /*
             * This flag may be written to by multiple backends during
             * PHJ_BATCH_PROBE phase, but will only be read in PHJ_BATCH_SCAN
             * phase so requires no extra locking.
             */
            batch->skip_unmatched = true;
        }
 
        /*
         * Even if we aren't doing a full/right outer join, we'll step through
         * the PHJ_BATCH_SCAN phase just to maintain the invariant that
         * freeing happens in PHJ_BATCH_FREE, but that'll be wait-free.
         *
         * NOTE(review): a false result from
         * BarrierArriveAndDetachExceptLast() presumably means this process
         * has already been detached, which is why the second detach call is
         * then skipped -- confirm against the barrier implementation.
         */
        if (BarrierPhase(&batch->batch_barrier) == PHJ_BATCH_PROBE)
            attached = BarrierArriveAndDetachExceptLast(&batch->batch_barrier);
        if (attached && BarrierArriveAndDetach(&batch->batch_barrier))
        {
            /*
             * We are no longer attached to the batch barrier, but we're the
             * process that was chosen to free resources and it's safe to
             * assert the current phase.  The ParallelHashJoinBatch can't go
             * away underneath us while we are attached to the build barrier,
             * making this access safe.
             */
            Assert(BarrierPhase(&batch->batch_barrier) == PHJ_BATCH_FREE);
 
            /*
             * Free shared chunks and buckets.  The chunk list is walked by
             * reading each chunk's successor before the chunk is released.
             */
            while (DsaPointerIsValid(batch->chunks))
            {
                HashMemoryChunk chunk =
                    dsa_get_address(hashtable->area, batch->chunks);
                dsa_pointer next = chunk->next.shared;
 
                dsa_free(hashtable->area, batch->chunks);
                batch->chunks = next;
            }
            if (DsaPointerIsValid(batch->buckets))
            {
                dsa_free(hashtable->area, batch->buckets);
                batch->buckets = InvalidDsaPointer;
            }
        }
 
        /*
         * Track the largest batch we've been attached to.  Though each
         * backend might see a different subset of batches, explain.c will
         * scan the results from all backends to find the largest value.
         */
        hashtable->spacePeak =
            Max(hashtable->spacePeak,
                batch->size + sizeof(dsa_pointer_atomic) * hashtable->nbuckets);
 
        /* Remember that we are not attached to a batch. */
        hashtable->curbatch = -1;
    }
}

Referenced by ExecHashJoinReInitializeDSM(), ExecParallelHashJoinNewBatch(), ExecParallelPrepHashTableForUnmatched(), and ExecShutdownHashJoin().

◆ ExecHashTableInsert()

void ExecHashTableInsert	(	HashJoinTable	hashtable,
		TupleTableSlot *	slot,
		uint32	hashvalue
	)

Definition at line 1748 of file nodeHash.c.

{
    int         bucketno;
    int         batchno;
    bool        shouldFree;
    MinimalTuple tuple;

    /*
     * Insert the tuple held in "slot" into the (non-parallel) hash table:
     * fetch it in minimal form, work out which bucket and batch its hash
     * value maps to, then either store it in memory or spool it to the temp
     * file of a later batch.
     */
    tuple = ExecFetchSlotMinimalTuple(slot, &shouldFree);
    ExecHashGetBucketAndBatch(hashtable, hashvalue, &bucketno, &batchno);

    if (batchno != hashtable->curbatch)
    {
        /* Belongs to a later batch: write it to that batch's temp file. */
        Assert(batchno > hashtable->curbatch);
        ExecHashJoinSaveTuple(tuple,
                              hashvalue,
                              &hashtable->innerBatchFile[batchno],
                              hashtable);
    }
    else
    {
        /* Belongs to the current batch: store it in the in-memory table. */
        double      ntuples = (hashtable->totalTuples - hashtable->skewTuples);
        int         entrySize = HJTUPLE_OVERHEAD + tuple->t_len;
        HashJoinTuple entry;

        /* Build the HashJoinTuple: header, then a copy of the tuple. */
        entry = (HashJoinTuple) dense_alloc(hashtable, entrySize);
        entry->hashvalue = hashvalue;
        memcpy(HJTUPLE_MINTUPLE(entry), tuple, tuple->t_len);

        /*
         * The match flag is always cleared on insertion.  That is fine even
         * for a tuple reloaded from a batch file, because it cannot have
         * been matched to an outer tuple before it was written there.
         */
        HeapTupleHeaderClearMatch(HJTUPLE_MINTUPLE(entry));

        /* Link the new entry in at the head of its bucket chain. */
        entry->next.unshared = hashtable->buckets.unshared[bucketno];
        hashtable->buckets.unshared[bucketno] = entry;

        /*
         * While there is still only one batch, double the optimal bucket
         * count once the NTUP_PER_BUCKET threshold has been exceeded --
         * provided doubling overflows neither an int nor the maximum
         * allocation size.
         */
        if (hashtable->nbatch == 1 &&
            ntuples > (hashtable->nbuckets_optimal * NTUP_PER_BUCKET) &&
            hashtable->nbuckets_optimal <= INT_MAX / 2 &&
            hashtable->nbuckets_optimal * 2 <= MaxAllocSize / sizeof(HashJoinTuple))
        {
            hashtable->nbuckets_optimal = hashtable->nbuckets_optimal * 2;
            hashtable->log2_nbuckets_optimal++;
        }

        /* Update space accounting, including the high-water mark. */
        hashtable->spaceUsed += entrySize;
        if (hashtable->spacePeak < hashtable->spaceUsed)
            hashtable->spacePeak = hashtable->spaceUsed;

        /* Over budget (tuples plus bucket array)?  Then add batches. */
        if (hashtable->spaceUsed +
            hashtable->nbuckets_optimal * sizeof(HashJoinTuple)
            > hashtable->spaceAllowed)
            ExecHashIncreaseNumBatches(hashtable);
    }

    /* Release the minimal tuple if fetching it produced a private copy. */
    if (shouldFree)
        heap_free_minimal_tuple(tuple);
}

Referenced by ExecHashJoinNewBatch(), and MultiExecPrivateHash().

◆ ExecHashTableReset()

void ExecHashTableReset ( HashJoinTable hashtable )

Definition at line 2326 of file nodeHash.c.

{
    int         bucketCount = hashtable->nbuckets;
    MemoryContext callerCxt;

    /*
     * Prepare the hash table for a new pass: drop every bucket header and
     * tuple acquired in the prior pass by resetting the batch context.
     */
    MemoryContextReset(hashtable->batchCxt);

    /* Reallocate and reinitialize the bucket headers in the batch context. */
    callerCxt = MemoryContextSwitchTo(hashtable->batchCxt);
    hashtable->buckets.unshared = palloc0_array(HashJoinTuple, bucketCount);
    MemoryContextSwitchTo(callerCxt);

    /* Nothing is stored any more. */
    hashtable->spaceUsed = 0;

    /* The chunk memory went away with the context reset; forget the list. */
    hashtable->chunks = NULL;
}

References HashJoinTableData::batchCxt, HashJoinTableData::buckets, HashJoinTableData::chunks, MemoryContextReset(), MemoryContextSwitchTo(), HashJoinTableData::nbuckets, palloc0_array, HashJoinTableData::spaceUsed, and HashJoinTableData::unshared.

Referenced by ExecHashJoinNewBatch().

◆ ExecHashTableResetMatchFlags()

void ExecHashTableResetMatchFlags ( HashJoinTable hashtable )

Definition at line 2354 of file nodeHash.c.

{
    int         bucketIdx;
    int         skewIdx;

    /*
     * Clear the match flag of every tuple stored in the hash table.
     * First walk the chain hanging off each regular bucket ...
     */
    for (bucketIdx = 0; bucketIdx < hashtable->nbuckets; bucketIdx++)
    {
        HashJoinTuple cur = hashtable->buckets.unshared[bucketIdx];

        while (cur != NULL)
        {
            HeapTupleHeaderClearMatch(HJTUPLE_MINTUPLE(cur));
            cur = cur->next.unshared;
        }
    }

    /*
     * ... then do the same for each active skew bucket.  skewBucketNums[]
     * maps the loop counter to the bucket's slot in skewBucket[].
     */
    for (skewIdx = 0; skewIdx < hashtable->nSkewBuckets; skewIdx++)
    {
        HashSkewBucket *bucket =
            hashtable->skewBucket[hashtable->skewBucketNums[skewIdx]];
        HashJoinTuple cur = bucket->tuples;

        while (cur != NULL)
        {
            HeapTupleHeaderClearMatch(HJTUPLE_MINTUPLE(cur));
            cur = cur->next.unshared;
        }
    }
}

References HashJoinTableData::buckets, HeapTupleHeaderClearMatch(), HJTUPLE_MINTUPLE, i, j, HashJoinTableData::nbuckets, HashJoinTupleData::next, HashJoinTableData::nSkewBuckets, HashJoinTableData::skewBucket, HashJoinTableData::skewBucketNums, HashSkewBucket::tuples, HashJoinTupleData::unshared, and HashJoinTableData::unshared.

Referenced by ExecReScanHashJoin().

◆ ExecInitHash()

HashState * ExecInitHash	(	Hash *	node,
		EState *	estate,
		int	eflags
	)

Definition at line 369 of file nodeHash.c.

{
    HashState  *state;
    PlanState  *ps;

    /* Reject executor flags this node does not support. */
    Assert(!(eflags & (EXEC_FLAG_BACKWARD | EXEC_FLAG_MARK)));

    /* Allocate the node state and fill in the generic PlanState part. */
    state = makeNode(HashState);
    ps = &state->ps;
    ps->plan = (Plan *) node;
    ps->state = estate;
    ps->ExecProcNode = ExecHash;

    /* The hash table itself is built later, by ExecHashTableCreate(). */
    state->hashtable = NULL;

    /* Give the node its own expression context. */
    ExecAssignExprContext(estate, ps);

    /* Initialize the child plan that feeds us tuples. */
    outerPlanState(state) = ExecInitNode(outerPlan(node), estate, eflags);

    /*
     * Set up the result slot and type.  No projection info is built because
     * this node never projects.
     */
    ExecInitResultTupleSlotTL(ps, &TTSOpsMinimalTuple);
    ps->ps_ProjInfo = NULL;

    Assert(node->plan.qual == NIL);

    /*
     * hash_expr is deliberately left unset until ExecInitHashJoin(): the
     * join type is not known here, and ExecBuildHash32Expr's keep_nulls
     * parameter depends on it.
     */
    state->hash_expr = NULL;

    return state;
}

References Assert(), EXEC_FLAG_BACKWARD, EXEC_FLAG_MARK, ExecAssignExprContext(), ExecHash(), ExecInitNode(), ExecInitResultTupleSlotTL(), PlanState::ExecProcNode, HashState::hash_expr, HashState::hashtable, makeNode, NIL, outerPlan, outerPlanState, PlanState::plan, Hash::plan, HashState::ps, PlanState::ps_ProjInfo, Plan::qual, PlanState::state, and TTSOpsMinimalTuple.

Referenced by ExecInitNode().

◆ ExecParallelHashTableAlloc()

void ExecParallelHashTableAlloc	(	HashJoinTable	hashtable,
		int	batchno
	)

Definition at line 3288 of file nodeHash.c.

{
    int         nbuckets = hashtable->parallel_state->nbuckets;
    ParallelHashJoinBatch *shared = hashtable->batches[batchno].shared;
    dsa_pointer_atomic *slots;
    int         slot = 0;

    /*
     * Allocate the bucket array for batch "batchno" in the DSA area and
     * record it in the batch's shared state.
     */
    shared->buckets =
        dsa_allocate(hashtable->area, sizeof(dsa_pointer_atomic) * nbuckets);

    /* Map the array into this process and mark every bucket as empty. */
    slots = (dsa_pointer_atomic *)
        dsa_get_address(hashtable->area, shared->buckets);
    while (slot < nbuckets)
    {
        dsa_pointer_atomic_init(&slots[slot], InvalidDsaPointer);
        slot++;
    }
}

References HashJoinTableData::area, HashJoinTableData::batches, ParallelHashJoinBatch::buckets, dsa_allocate, dsa_get_address(), dsa_pointer_atomic_init, i, InvalidDsaPointer, ParallelHashJoinState::nbuckets, HashJoinTableData::parallel_state, and ParallelHashJoinBatchAccessor::shared.

Referenced by ExecHashTableCreate(), and ExecParallelHashJoinNewBatch().

◆ ExecParallelHashTableInsert()

void ExecParallelHashTableInsert	(	HashJoinTable	hashtable,
		TupleTableSlot *	slot,
		uint32	hashvalue
	)

Definition at line 1838 of file nodeHash.c.

{
    /*
     * Insert the tuple held in "slot" during the parallel build: tuples that
     * map to batch 0 are copied into shared memory and pushed onto their
     * bucket, tuples for any other batch are written to that batch's shared
     * inner tuplestore.  Either way the per-batch tuple counter is bumped.
     */
    bool        shouldFree;
    MinimalTuple tuple = ExecFetchSlotMinimalTuple(slot, &shouldFree);
    dsa_pointer shared;
    int         bucketno;
    int         batchno;
 
    /*
     * Both failure paths below jump back here so that the bucket and batch
     * numbers are recomputed before trying again.
     * NOTE(review): presumably the allocation helpers fail precisely when
     * the number of batches or buckets has changed -- confirm against
     * ExecParallelHashTupleAlloc() and ExecParallelHashTuplePrealloc().
     */
retry:
    ExecHashGetBucketAndBatch(hashtable, hashvalue, &bucketno, &batchno);
 
    if (batchno == 0)
    {
        HashJoinTuple hashTuple;
 
        /* Try to load it into memory. */
        Assert(BarrierPhase(&hashtable->parallel_state->build_barrier) ==
               PHJ_BUILD_HASH_INNER);
        hashTuple = ExecParallelHashTupleAlloc(hashtable,
                                               HJTUPLE_OVERHEAD + tuple->t_len,
                                               &shared);
        /* No memory handed out: recompute the target and try again. */
        if (hashTuple == NULL)
            goto retry;
 
        /* Store the hash value in the HashJoinTuple header. */
        hashTuple->hashvalue = hashvalue;
        memcpy(HJTUPLE_MINTUPLE(hashTuple), tuple, tuple->t_len);
        HeapTupleHeaderClearMatch(HJTUPLE_MINTUPLE(hashTuple));
 
        /* Push it onto the front of the bucket's list */
        ExecParallelHashPushTuple(&hashtable->buckets.shared[bucketno],
                                  hashTuple, shared);
    }
    else
    {
        /* Space this tuple will need once it is loaded into memory. */
        size_t      tuple_size = MAXALIGN(HJTUPLE_OVERHEAD + tuple->t_len);
 
        Assert(batchno > 0);
 
        /* Try to preallocate space in the batch if necessary. */
        if (hashtable->batches[batchno].preallocated < tuple_size)
        {
            if (!ExecParallelHashTuplePrealloc(hashtable, batchno, tuple_size))
                goto retry;
        }
 
        /* Consume part of the reservation, then write the tuple out. */
        Assert(hashtable->batches[batchno].preallocated >= tuple_size);
        hashtable->batches[batchno].preallocated -= tuple_size;
        sts_puttuple(hashtable->batches[batchno].inner_tuples, &hashvalue,
                     tuple);
    }
    /* Count the tuple against whichever batch it ended up in. */
    ++hashtable->batches[batchno].ntuples;
 
    if (shouldFree)
        heap_free_minimal_tuple(tuple);
}

References Assert(), BarrierPhase(), HashJoinTableData::batches, HashJoinTableData::buckets, ParallelHashJoinState::build_barrier, ExecFetchSlotMinimalTuple(), ExecHashGetBucketAndBatch(), ExecParallelHashPushTuple(), ExecParallelHashTupleAlloc(), ExecParallelHashTuplePrealloc(), HashJoinTupleData::hashvalue, heap_free_minimal_tuple(), HeapTupleHeaderClearMatch(), HJTUPLE_MINTUPLE, HJTUPLE_OVERHEAD, ParallelHashJoinBatchAccessor::inner_tuples, MAXALIGN, ParallelHashJoinBatchAccessor::ntuples, HashJoinTableData::parallel_state, PHJ_BUILD_HASH_INNER, ParallelHashJoinBatchAccessor::preallocated, HashJoinTableData::shared, sts_puttuple(), and MinimalTupleData::t_len.

Referenced by MultiExecParallelHash().

◆ ExecParallelHashTableInsertCurrentBatch()

void ExecParallelHashTableInsertCurrentBatch	(	HashJoinTable	hashtable,
		TupleTableSlot *	slot,
		uint32	hashvalue
	)

Definition at line 1904 of file nodeHash.c.

{
    int         bucketno;
    int         batchno;
    bool        shouldFree;
    dsa_pointer entryPtr;
    HashJoinTuple entry;
    MinimalTuple tuple;

    /*
     * Insert the tuple held in "slot" into the shared hash table of the
     * batch this process is currently attached to.
     */
    tuple = ExecFetchSlotMinimalTuple(slot, &shouldFree);

    /* The tuple is expected to belong to the current batch. */
    ExecHashGetBucketAndBatch(hashtable, hashvalue, &bucketno, &batchno);
    Assert(batchno == hashtable->curbatch);

    /* Allocate shared space, then fill in the header and the tuple copy. */
    entry = ExecParallelHashTupleAlloc(hashtable,
                                       HJTUPLE_OVERHEAD + tuple->t_len,
                                       &entryPtr);
    memcpy(HJTUPLE_MINTUPLE(entry), tuple, tuple->t_len);
    entry->hashvalue = hashvalue;
    HeapTupleHeaderClearMatch(HJTUPLE_MINTUPLE(entry));

    /* Link it in at the front of its bucket's list. */
    ExecParallelHashPushTuple(&hashtable->buckets.shared[bucketno],
                              entry, entryPtr);

    /* Release the minimal tuple if fetching it produced a private copy. */
    if (shouldFree)
        heap_free_minimal_tuple(tuple);
}

References Assert(), HashJoinTableData::buckets, HashJoinTableData::curbatch, ExecFetchSlotMinimalTuple(), ExecHashGetBucketAndBatch(), ExecParallelHashPushTuple(), ExecParallelHashTupleAlloc(), HashJoinTupleData::hashvalue, heap_free_minimal_tuple(), HeapTupleHeaderClearMatch(), HJTUPLE_MINTUPLE, HJTUPLE_OVERHEAD, HashJoinTableData::shared, and MinimalTupleData::t_len.

Referenced by ExecParallelHashJoinNewBatch().

◆ ExecParallelHashTableSetCurrentBatch()

void ExecParallelHashTableSetCurrentBatch	(	HashJoinTable	hashtable,
		int	batchno
	)

Definition at line 3498 of file nodeHash.c.

{
    ParallelHashJoinBatch *shared = hashtable->batches[batchno].shared;

    /*
     * Make "batchno" the batch this process works on.  Its bucket array
     * must already have been allocated.
     */
    Assert(shared->buckets != InvalidDsaPointer);

    /* Point the process-local view at the batch's shared bucket array. */
    hashtable->buckets.shared = (dsa_pointer_atomic *)
        dsa_get_address(hashtable->area, shared->buckets);
    hashtable->curbatch = batchno;

    /* Pick up the bucket count from the shared state. */
    hashtable->nbuckets = hashtable->parallel_state->nbuckets;
    hashtable->log2_nbuckets = pg_ceil_log2_32(hashtable->nbuckets);

    /* Start out with no chunk in use for this batch. */
    hashtable->current_chunk_shared = InvalidDsaPointer;
    hashtable->current_chunk = NULL;
    hashtable->batches[batchno].at_least_one_chunk = false;
}

Referenced by ExecParallelHashIncreaseNumBatches(), ExecParallelHashIncreaseNumBuckets(), ExecParallelHashJoinNewBatch(), and MultiExecParallelHash().

◆ ExecParallelPrepHashTableForUnmatched()

bool ExecParallelPrepHashTableForUnmatched ( HashJoinState * hjstate )

Definition at line 2124 of file nodeHash.c.

{
    /*
     * Decide whether this process will scan the current batch for unmatched
     * inner tuples.
     *
     * Returns true if this process won the election and the process-local
     * scan state has been prepared.  Returns false if another process won,
     * or if the unmatched scan is to be skipped; in both of those cases the
     * batch is marked done and curbatch has been reset to -1.
     */
    HashJoinTable hashtable = hjstate->hj_HashTable;
    int         curbatch = hashtable->curbatch;
    ParallelHashJoinBatch *batch = hashtable->batches[curbatch].shared;
 
    Assert(BarrierPhase(&batch->batch_barrier) == PHJ_BATCH_PROBE);
 
    /*
     * It would not be deadlock-free to wait on the batch barrier, because it
     * is in PHJ_BATCH_PROBE phase, and thus processes attached to it have
     * already emitted tuples.  Therefore, we'll hold a wait-free election:
     * only one process can continue to the next phase, and all others detach
     * from this batch.  They can still do any work on other batches, if there
     * are any.
     */
    if (!BarrierArriveAndDetachExceptLast(&batch->batch_barrier))
    {
        /* This process considers the batch to be done. */
        hashtable->batches[hashtable->curbatch].done = true;
 
        /* Make sure any temporary files are closed. */
        sts_end_parallel_scan(hashtable->batches[curbatch].inner_tuples);
        sts_end_parallel_scan(hashtable->batches[curbatch].outer_tuples);
 
        /*
         * Track largest batch we've seen, which would normally happen in
         * ExecHashTableDetachBatch().
         */
        hashtable->spacePeak =
            Max(hashtable->spacePeak,
                batch->size + sizeof(dsa_pointer_atomic) * hashtable->nbuckets);
        /* Remember that we are not attached to a batch. */
        hashtable->curbatch = -1;
        return false;
    }
 
    /* Now we are alone with this batch. */
    Assert(BarrierPhase(&batch->batch_barrier) == PHJ_BATCH_SCAN);
 
    /*
     * Has another process decided to give up early and command all processes
     * to skip the unmatched scan?
     */
    if (batch->skip_unmatched)
    {
        hashtable->batches[hashtable->curbatch].done = true;
        /* The full detach path also resets curbatch to -1. */
        ExecHashTableDetachBatch(hashtable);
        return false;
    }
 
    /* Now prepare the process local state, just as for non-parallel join. */
    ExecPrepHashTableForUnmatched(hjstate);
 
    return true;
}

References Assert(), BarrierArriveAndDetachExceptLast(), BarrierPhase(), ParallelHashJoinBatch::batch_barrier, HashJoinTableData::batches, HashJoinTableData::curbatch, ParallelHashJoinBatchAccessor::done, ExecHashTableDetachBatch(), ExecPrepHashTableForUnmatched(), HashJoinState::hj_HashTable, ParallelHashJoinBatchAccessor::inner_tuples, Max, HashJoinTableData::nbuckets, ParallelHashJoinBatchAccessor::outer_tuples, PHJ_BATCH_PROBE, PHJ_BATCH_SCAN, ParallelHashJoinBatchAccessor::shared, ParallelHashJoinBatch::size, ParallelHashJoinBatch::skip_unmatched, HashJoinTableData::spacePeak, and sts_end_parallel_scan().

Referenced by ExecHashJoinImpl().

◆ ExecParallelScanHashBucket()

bool ExecParallelScanHashBucket	(	HashJoinState *	hjstate,
		ExprContext *	econtext
	)

Definition at line 2052 of file nodeHash.c.

{
    HashJoinTable hashtable = hjstate->hj_HashTable;
    uint32      wanted = hjstate->hj_CurHashValue;
    ExprState  *joinquals = hjstate->hashclauses;
    HashJoinTuple cur = hjstate->hj_CurTuple;

    /*
     * Find the next tuple in the current bucket of the shared hash table
     * that matches the current outer tuple.
     *
     * hj_CurTuple is the tuple returned last time from this bucket; NULL
     * means the scan of the bucket has not started yet.
     */
    if (cur == NULL)
        cur = ExecParallelHashFirstTuple(hashtable, hjstate->hj_CurBucketNo);
    else
        cur = ExecParallelHashNextTuple(hashtable, cur);

    for (; cur != NULL; cur = ExecParallelHashNextTuple(hashtable, cur))
    {
        /* Cheap filter first: stored hash value must equal the probe's. */
        if (cur->hashvalue != wanted)
            continue;

        /* Expose the stored tuple as the inner tuple so ExecQual sees it. */
        econtext->ecxt_innertuple =
            ExecStoreMinimalTuple(HJTUPLE_MINTUPLE(cur),
                                  hjstate->hj_HashTupleSlot,
                                  false);   /* do not pfree */

        if (ExecQualAndReset(joinquals, econtext))
        {
            /* Remember where we stopped so the next call resumes here. */
            hjstate->hj_CurTuple = cur;
            return true;
        }
    }

    /* Ran off the end of the bucket without a match. */
    return false;
}

References ExprContext::ecxt_innertuple, ExecParallelHashFirstTuple(), ExecParallelHashNextTuple(), ExecQualAndReset(), ExecStoreMinimalTuple(), HashJoinState::hashclauses, HashJoinTupleData::hashvalue, HashJoinState::hj_CurBucketNo, HashJoinState::hj_CurHashValue, HashJoinState::hj_CurTuple, HashJoinState::hj_HashTable, HashJoinState::hj_HashTupleSlot, and HJTUPLE_MINTUPLE.

Referenced by ExecHashJoinImpl().

◆ ExecParallelScanHashTableForUnmatched()

bool ExecParallelScanHashTableForUnmatched	(	HashJoinState *	hjstate,
		ExprContext *	econtext
	)

Definition at line 2263 of file nodeHash.c.

{
    HashJoinTable hashtable = hjstate->hj_HashTable;
    HashJoinTuple cur = hjstate->hj_CurTuple;

    /*
     * Return the next tuple in the shared hash table whose match flag is not
     * set, resuming from the position saved in hjstate.
     */
    for (;;)
    {
        /*
         * A non-NULL position means we are part-way through a bucket chain;
         * otherwise move on to the next bucket, or stop when none remain.
         */
        if (cur != NULL)
            cur = ExecParallelHashNextTuple(hashtable, cur);
        else if (hjstate->hj_CurBucketNo < hashtable->nbuckets)
            cur = ExecParallelHashFirstTuple(hashtable,
                                             hjstate->hj_CurBucketNo++);
        else
            return false;       /* no more unmatched tuples */

        for (; cur != NULL; cur = ExecParallelHashNextTuple(hashtable, cur))
        {
            if (HeapTupleHeaderHasMatch(HJTUPLE_MINTUPLE(cur)))
                continue;

            /* Hand the unmatched tuple back through the inner-tuple slot. */
            econtext->ecxt_innertuple =
                ExecStoreMinimalTuple(HJTUPLE_MINTUPLE(cur),
                                      hjstate->hj_HashTupleSlot,
                                      false);   /* do not pfree */

            /*
             * No qual is evaluated here, but the caller will evaluate one,
             * so temp memory is reset per tuple just as in
             * ExecScanHashBucket.
             */
            ResetExprContext(econtext);

            hjstate->hj_CurTuple = cur;
            return true;
        }

        /* Keep long scans over matched tuples cancellable. */
        CHECK_FOR_INTERRUPTS();
    }
}

References CHECK_FOR_INTERRUPTS, ExprContext::ecxt_innertuple, ExecParallelHashFirstTuple(), ExecParallelHashNextTuple(), ExecStoreMinimalTuple(), HeapTupleHeaderHasMatch(), HashJoinState::hj_CurBucketNo, HashJoinState::hj_CurTuple, HashJoinState::hj_HashTable, HashJoinState::hj_HashTupleSlot, HJTUPLE_MINTUPLE, HashJoinTableData::nbuckets, and ResetExprContext.

Referenced by ExecHashJoinImpl().

◆ ExecPrepHashTableForUnmatched()

void ExecPrepHashTableForUnmatched ( HashJoinState * hjstate )

Definition at line 2103 of file nodeHash.c.

{
    /*
     * Reset the scan position for a pass over unmatched inner tuples.
     * While that pass runs, the HashJoinState fields mean:
     *
     *   hj_CurTuple        - last tuple returned, or NULL to start the
     *                        next bucket
     *   hj_CurBucketNo     - next regular bucket to scan
     *   hj_CurSkewBucketNo - next skew bucket, as an index into
     *                        skewBucketNums
     */
    hjstate->hj_CurTuple = NULL;
    hjstate->hj_CurBucketNo = 0;
    hjstate->hj_CurSkewBucketNo = 0;
}

References HashJoinState::hj_CurBucketNo, HashJoinState::hj_CurSkewBucketNo, and HashJoinState::hj_CurTuple.

Referenced by ExecHashJoinImpl(), and ExecParallelPrepHashTableForUnmatched().

◆ ExecReScanHash()

void ExecReScanHash ( HashState * node )

Definition at line 2380 of file nodeHash.c.

{
    PlanState  *child = outerPlanState(node);

    /*
     * When the child has changed parameters pending (chgParam is set), it
     * will be re-scanned by the first ExecProcNode, so there is nothing to
     * do here.
     */
    if (child->chgParam != NULL)
        return;

    ExecReScan(child);
}

References ExecReScan(), outerPlan, and outerPlanState.

Referenced by ExecReScan().

◆ ExecScanHashBucket()

bool ExecScanHashBucket	(	HashJoinState *	hjstate,
		ExprContext *	econtext
	)

Definition at line 1991 of file nodeHash.c.

{
    HashJoinTable hashtable = hjstate->hj_HashTable;
    uint32      wanted = hjstate->hj_CurHashValue;
    ExprState  *joinquals = hjstate->hashclauses;
    HashJoinTuple cur = hjstate->hj_CurTuple;

    /*
     * Find the next tuple in the current bucket that matches the current
     * outer tuple.
     *
     * hj_CurTuple is the tuple returned last time from this bucket; NULL
     * means the scan has not started yet.  A fresh scan starts in the skew
     * bucket when the outer tuple hashed to one, and in the regular bucket
     * otherwise.
     */
    if (cur != NULL)
        cur = cur->next.unshared;
    else if (hjstate->hj_CurSkewBucketNo == INVALID_SKEW_BUCKET_NO)
        cur = hashtable->buckets.unshared[hjstate->hj_CurBucketNo];
    else
        cur = hashtable->skewBucket[hjstate->hj_CurSkewBucketNo]->tuples;

    for (; cur != NULL; cur = cur->next.unshared)
    {
        /* Cheap filter first: stored hash value must equal the probe's. */
        if (cur->hashvalue != wanted)
            continue;

        /* Expose the stored tuple as the inner tuple so ExecQual sees it. */
        econtext->ecxt_innertuple =
            ExecStoreMinimalTuple(HJTUPLE_MINTUPLE(cur),
                                  hjstate->hj_HashTupleSlot,
                                  false);   /* do not pfree */

        if (ExecQualAndReset(joinquals, econtext))
        {
            /* Remember where we stopped so the next call resumes here. */
            hjstate->hj_CurTuple = cur;
            return true;
        }
    }

    /* Ran off the end of the chain without a match. */
    return false;
}

References HashJoinTableData::buckets, ExprContext::ecxt_innertuple, ExecQualAndReset(), ExecStoreMinimalTuple(), HashJoinState::hashclauses, HashJoinTupleData::hashvalue, HashJoinState::hj_CurBucketNo, HashJoinState::hj_CurHashValue, HashJoinState::hj_CurSkewBucketNo, HashJoinState::hj_CurTuple, HashJoinState::hj_HashTable, HashJoinState::hj_HashTupleSlot, HJTUPLE_MINTUPLE, INVALID_SKEW_BUCKET_NO, HashJoinTupleData::next, HashJoinTableData::skewBucket, HashSkewBucket::tuples, HashJoinTupleData::unshared, and HashJoinTableData::unshared.

Referenced by ExecHashJoinImpl().

◆ ExecScanHashTableForUnmatched()

bool ExecScanHashTableForUnmatched	(	HashJoinState *	hjstate,
		ExprContext *	econtext
	)

Definition at line 2189 of file nodeHash.c.

{
    HashJoinTable hashtable = hjstate->hj_HashTable;
    HashJoinTuple cur = hjstate->hj_CurTuple;

    /*
     * Return the next tuple in the hash table whose match flag is not set,
     * resuming from the position saved in hjstate.  Regular buckets are
     * visited first, then the skew buckets.
     */
    for (;;)
    {
        /*
         * A non-NULL position means we are part-way through a chain;
         * otherwise start the next regular bucket, then the next skew
         * bucket, and stop once both kinds are exhausted.
         */
        if (cur != NULL)
            cur = cur->next.unshared;
        else if (hjstate->hj_CurBucketNo < hashtable->nbuckets)
            cur = hashtable->buckets.unshared[hjstate->hj_CurBucketNo++];
        else if (hjstate->hj_CurSkewBucketNo < hashtable->nSkewBuckets)
        {
            int         skewno =
                hashtable->skewBucketNums[hjstate->hj_CurSkewBucketNo++];

            cur = hashtable->skewBucket[skewno]->tuples;
        }
        else
            return false;       /* no more unmatched tuples */

        for (; cur != NULL; cur = cur->next.unshared)
        {
            if (HeapTupleHeaderHasMatch(HJTUPLE_MINTUPLE(cur)))
                continue;

            /* Hand the unmatched tuple back through the inner-tuple slot. */
            econtext->ecxt_innertuple =
                ExecStoreMinimalTuple(HJTUPLE_MINTUPLE(cur),
                                      hjstate->hj_HashTupleSlot,
                                      false);   /* do not pfree */

            /*
             * No qual is evaluated here, but the caller will evaluate one,
             * so temp memory is reset per tuple just as in
             * ExecScanHashBucket.
             */
            ResetExprContext(econtext);

            hjstate->hj_CurTuple = cur;
            return true;
        }

        /* Keep long scans over matched tuples cancellable. */
        CHECK_FOR_INTERRUPTS();
    }
}

References HashJoinTableData::buckets, CHECK_FOR_INTERRUPTS, ExprContext::ecxt_innertuple, ExecStoreMinimalTuple(), HeapTupleHeaderHasMatch(), HashJoinState::hj_CurBucketNo, HashJoinState::hj_CurSkewBucketNo, HashJoinState::hj_CurTuple, HashJoinState::hj_HashTable, HashJoinState::hj_HashTupleSlot, HJTUPLE_MINTUPLE, j, HashJoinTableData::nbuckets, HashJoinTupleData::next, HashJoinTableData::nSkewBuckets, ResetExprContext, HashJoinTableData::skewBucket, HashJoinTableData::skewBucketNums, HashSkewBucket::tuples, HashJoinTupleData::unshared, and HashJoinTableData::unshared.

Referenced by ExecHashJoinImpl().

◆ ExecShutdownHash()

void ExecShutdownHash ( HashState * node )

Definition at line 2830 of file nodeHash.c.

{
    /*
     * Shutdown hook for a Hash node: make sure a HashInstrumentation holder
     * exists when instrumentation is enabled for this node, then fold the
     * statistics of the current (final) hash table into it.
     */
    HashInstrumentation *stats = node->hinstrument;

    /* EXPLAIN'ing and no save space yet: allocate a zero-filled holder now. */
    if (stats == NULL && node->ps.instrument != NULL)
    {
        stats = palloc0_object(HashInstrumentation);
        node->hinstrument = stats;
    }

    /* Accumulate only when there is both a holder and a hash table to read. */
    if (stats != NULL && node->hashtable != NULL)
    {
        ExecHashAccumInstrumentation(stats, node->hashtable);
    }
}

References ExecHashAccumInstrumentation(), HashState::hashtable, HashState::hinstrument, PlanState::instrument, palloc0_object, and HashState::ps.

Referenced by ExecShutdownNode_walker().

◆ MultiExecHash()

Node * MultiExecHash ( HashState * node )

Definition at line 104 of file nodeHash.c.

{
    /*
     * Build the hash table for this Hash node, bracketing the work with the
     * node's own instrumentation start/stop calls.  Always returns NULL; see
     * the note before the return statement.
     */

    /* We must provide our own instrumentation support: open the interval. */
    if (node->ps.instrument != NULL)
    {
        InstrStartNode(node->ps.instrument);
    }

    /* Without attached parallel state take the private path, else parallel. */
    if (node->parallel_state == NULL)
    {
        MultiExecPrivateHash(node);
    }
    else
    {
        MultiExecParallelHash(node);
    }

    /* Close the interval, passing the hash table's partialTuples count. */
    if (node->ps.instrument != NULL)
    {
        InstrStopNode(node->ps.instrument, node->hashtable->partialTuples);
    }

    /*
     * The hash table itself is not handed back: it is not a subtype of Node,
     * so returning it would violate the MultiExecProcNode API.  The parent
     * Hashjoin node is expected to fetch it from this node's state instead.
     * Ugly, but not really worth cleaning up, because Hashjoin already knows
     * quite a bit more about Hash than just that.
     */
    return NULL;
}

References HashState::hashtable, InstrStartNode(), InstrStopNode(), PlanState::instrument, MultiExecParallelHash(), MultiExecPrivateHash(), HashState::parallel_state, HashJoinTableData::partialTuples, and HashState::ps.

Referenced by MultiExecProcNode().

Functions

Function Documentation

◆ ExecChooseHashTableSize()

◆ ExecEndHash()

◆ ExecHashAccumInstrumentation()

◆ ExecHashEstimate()

◆ ExecHashGetBucketAndBatch()

◆ ExecHashGetSkewBucket()

◆ ExecHashInitializeDSM()

◆ ExecHashInitializeWorker()

◆ ExecHashRetrieveInstrumentation()

◆ ExecHashTableCreate()

◆ ExecHashTableDestroy()

◆ ExecHashTableDetach()

◆ ExecHashTableDetachBatch()

◆ ExecHashTableInsert()

◆ ExecHashTableReset()

◆ ExecHashTableResetMatchFlags()

◆ ExecInitHash()

◆ ExecParallelHashTableAlloc()

◆ ExecParallelHashTableInsert()

◆ ExecParallelHashTableInsertCurrentBatch()

◆ ExecParallelHashTableSetCurrentBatch()

◆ ExecParallelPrepHashTableForUnmatched()

◆ ExecParallelScanHashBucket()

◆ ExecParallelScanHashTableForUnmatched()

◆ ExecPrepHashTableForUnmatched()

◆ ExecReScanHash()

◆ ExecScanHashBucket()

◆ ExecScanHashTableForUnmatched()

◆ ExecShutdownHash()

◆ MultiExecHash()