PostgreSQL Source Code git master
aio.c
Go to the documentation of this file.
1/*-------------------------------------------------------------------------
2 *
3 * aio.c
4 * AIO - Core Logic
5 *
6 * For documentation about how AIO works on a higher level, including a
7 * schematic example, see README.md.
8 *
9 *
10 * AIO is a complicated subsystem. To keep things navigable, it is split
11 * across a number of files:
12 *
13 * - method_*.c - different ways of executing AIO (e.g. worker process)
14 *
15 * - aio_target.c - IO on different kinds of targets
16 *
17 * - aio_io.c - method-independent code for specific IO ops (e.g. readv)
18 *
19 * - aio_callback.c - callbacks at IO operation lifecycle events
20 *
21 * - aio_init.c - per-server and per-backend initialization
22 *
23 * - aio.c - all other topics
24 *
25 * - read_stream.c - helper for reading buffered relation data
26 *
27 * - README.md - higher-level overview over AIO
28 *
29 *
30 * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
31 * Portions Copyright (c) 1994, Regents of the University of California
32 *
33 * IDENTIFICATION
34 * src/backend/storage/aio/aio.c
35 *
36 *-------------------------------------------------------------------------
37 */
38
39#include "postgres.h"
40
41#include "lib/ilist.h"
42#include "miscadmin.h"
43#include "port/atomics.h"
44#include "storage/aio.h"
46#include "storage/aio_subsys.h"
47#include "utils/guc.h"
48#include "utils/guc_hooks.h"
50#include "utils/resowner.h"
51#include "utils/wait_event_types.h"
52
53
54static inline void pgaio_io_update_state(PgAioHandle *ioh, PgAioHandleState new_state);
55static void pgaio_io_reclaim(PgAioHandle *ioh);
57static void pgaio_io_wait_for_free(void);
58static PgAioHandle *pgaio_io_from_wref(PgAioWaitRef *iow, uint64 *ref_generation);
59static const char *pgaio_io_state_get_name(PgAioHandleState s);
60static void pgaio_io_wait(PgAioHandle *ioh, uint64 ref_generation);
61
62
63/* Options for io_method. */
65 {"sync", IOMETHOD_SYNC, false},
66 {"worker", IOMETHOD_WORKER, false},
67#ifdef IOMETHOD_IO_URING_ENABLED
68 {"io_uring", IOMETHOD_IO_URING, false},
69#endif
70 {NULL, 0, false}
71};
72
73/* GUCs */
76
77/* global control for AIO */
79
80/* current backend's per-backend state */
82
83
84static const IoMethodOps *const pgaio_method_ops_table[] = {
87#ifdef IOMETHOD_IO_URING_ENABLED
88 [IOMETHOD_IO_URING] = &pgaio_uring_ops,
89#endif
90};
91
93 "io_method_options out of sync with pgaio_method_ops_table");
94
95/* callbacks for the configured io_method, set by assign_io_method */
97
98
99/* --------------------------------------------------------------------------------
100 * Public Functions related to PgAioHandle
101 * --------------------------------------------------------------------------------
102 */
103
104/*
105 * Acquire an AioHandle, waiting for IO completion if necessary.
106 *
107 * Each backend can only have one AIO handle that has been "handed out" to
108 * code, but not yet submitted or released. This restriction is necessary to
109 * ensure that it is possible for code to wait for an unused handle by waiting
110 * for in-flight IO to complete. There is a limited number of handles in each
111 * backend; if multiple handles could be handed out without being submitted,
112 * waiting for all in-flight IO to complete would not guarantee that handles
113 * free up.
114 *
115 * It is cheap to acquire an IO handle, unless all handles are in use. In that
116 * case this function waits for the oldest IO to complete. If that is not
117 * desirable, use pgaio_io_acquire_nb().
118 *
119 * If a handle was acquired but then does not turn out to be needed,
120 * e.g. because pgaio_io_acquire() is called before starting an IO in a
121 * critical section, the handle needs to be released with pgaio_io_release().
122 *
123 *
124 * To react to the completion of the IO as soon as it is known to have
125 * completed, callbacks can be registered with pgaio_io_register_callbacks().
126 *
127 * To actually execute IO using the returned handle, the pgaio_io_start_*()
128 * family of functions is used. In many cases the pgaio_io_start_*() call will
129 * not be done directly by code that acquired the handle, but by lower level
130 * code that gets passed the handle. E.g. if code in bufmgr.c wants to perform
131 * AIO, it typically will pass the handle to smgr.c, which will pass it on to
132 * md.c, on to fd.c, which then finally calls pgaio_io_start_*(). This
133 * forwarding allows the various layers to react to the IO's completion by
134 * registering callbacks. These callbacks in turn can translate a lower
135 * layer's result into a result understandable by a higher layer.
136 *
137 * During pgaio_io_start_*() the IO is staged (i.e. prepared for execution but
138 * not submitted to the kernel). Unless in batchmode
139 * (c.f. pgaio_enter_batchmode()), the IO will also get submitted for
140 * execution. Note that, whether in batchmode or not, the IO might even
141 * complete before the functions return.
142 *
143 * After pgaio_io_start_*() the AioHandle is "consumed" and may not be
144 * referenced by the IO issuing code. To e.g. wait for IO, references to the
145 * IO can be established with pgaio_io_get_wref() *before* pgaio_io_start_*()
146 * is called. pgaio_wref_wait() can be used to wait for the IO to complete.
147 *
148 *
149 * To know if the IO [partially] succeeded or failed, a PgAioReturn * can be
150 * passed to pgaio_io_acquire(). Once the issuing backend has called
151 * pgaio_wref_wait(), the PgAioReturn contains information about whether the
152 * operation succeeded and details about the first failure, if any. The error
153 * can be raised / logged with pgaio_result_report().
154 *
155 * The lifetime of the memory pointed to by *ret needs to be at least as long
156 * as the passed in resowner. If the resowner releases resources before the IO
157 * completes (typically due to an error), the reference to *ret will be
158 * cleared. In case of resowner cleanup *ret will not be updated with the
159 * results of the IO operation.
160 */
163{
164 PgAioHandle *h;
165
166 while (true)
167 {
168 h = pgaio_io_acquire_nb(resowner, ret);
169
170 if (h != NULL)
171 return h;
172
173 /*
174 * Evidently all handles by this backend are in use. Just wait for
175 * some to complete.
176 */
178 }
179}
180
181/*
182 * Acquire an AioHandle, returning NULL if no handles are free.
183 *
184 * See pgaio_io_acquire(). The only difference is that this function will return
185 * NULL if there are no idle handles, instead of blocking.
186 */
189{
190 PgAioHandle *ioh = NULL;
191
193 {
196 }
197
199 elog(ERROR, "API violation: Only one IO can be handed out");
200
201 /*
202 * Probably not needed today, as interrupts should not process this IO,
203 * but...
204 */
206
208 {
210
211 ioh = dclist_container(PgAioHandle, node, ion);
212
213 Assert(ioh->state == PGAIO_HS_IDLE);
215
218
219 if (resowner)
221
222 if (ret)
223 {
224 ioh->report_return = ret;
226 }
227 }
228
230
231 return ioh;
232}
233
234/*
235 * Release IO handle that turned out to not be required.
236 *
237 * See pgaio_io_acquire() for more details.
238 */
239void
241{
243 {
245 Assert(ioh->resowner);
246
248
249 /*
250 * Note that no interrupts are processed between the handed_out_io
251 * check and the call to reclaim - that's important as otherwise an
252 * interrupt could have already reclaimed the handle.
253 */
254 pgaio_io_reclaim(ioh);
255 }
256 else
257 {
258 elog(ERROR, "release in unexpected state");
259 }
260}
261
262/*
263 * Release IO handle during resource owner cleanup.
264 */
265void
266pgaio_io_release_resowner(dlist_node *ioh_node, bool on_error)
267{
268 PgAioHandle *ioh = dlist_container(PgAioHandle, resowner_node, ioh_node);
269
270 Assert(ioh->resowner);
271
272 /*
273 * Otherwise an interrupt, in the middle of releasing the IO, could end up
274 * trying to wait for the IO, leading to state confusion.
275 */
277
279 ioh->resowner = NULL;
280
281 switch ((PgAioHandleState) ioh->state)
282 {
283 case PGAIO_HS_IDLE:
284 elog(ERROR, "unexpected");
285 break;
288
290 {
292 if (!on_error)
293 elog(WARNING, "leaked AIO handle");
294 }
295
296 pgaio_io_reclaim(ioh);
297 break;
298 case PGAIO_HS_DEFINED:
299 case PGAIO_HS_STAGED:
300 if (!on_error)
301 elog(WARNING, "AIO handle was not submitted");
303 break;
308 /* this is expected to happen */
309 break;
310 }
311
312 /*
313 * Need to unregister the reporting of the IO's result, the memory it's
314 * referencing likely has gone away.
315 */
316 if (ioh->report_return)
317 ioh->report_return = NULL;
318
320}
321
322/*
323 * Add a [set of] flags to the IO.
324 *
325 * Note that this combines flags with already set flags, rather than setting
326 * the flags to exactly the passed-in value. This is to allow multiple
327 * callsites to set flags.
328 */
329void
331{
333
334 ioh->flags |= flag;
335}
336
337/*
338 * Returns an ID uniquely identifying the IO handle. This is only really
339 * useful for logging, as handles are reused across multiple IOs.
340 */
341int
343{
344 Assert(ioh >= pgaio_ctl->io_handles &&
346 return ioh - pgaio_ctl->io_handles;
347}
348
349/*
350 * Return the ProcNumber for the process that can use an IO handle. The
351 * mapping from IO handles to PGPROCs is static; therefore this even works
352 * when the corresponding PGPROC is not in use.
353 */
356{
357 return ioh->owner_procno;
358}
359
360/*
361 * Return a wait reference for the IO. Only wait references can be used to
362 * wait for an IO's completion, as handles themselves can be reused after
363 * completion. See also the comment above pgaio_io_acquire().
364 */
365void
367{
369 ioh->state == PGAIO_HS_DEFINED ||
370 ioh->state == PGAIO_HS_STAGED);
371 Assert(ioh->generation != 0);
372
373 iow->aio_index = ioh - pgaio_ctl->io_handles;
374 iow->generation_upper = (uint32) (ioh->generation >> 32);
375 iow->generation_lower = (uint32) ioh->generation;
376}
377
378
379
380/* --------------------------------------------------------------------------------
381 * Internal Functions related to PgAioHandle
382 * --------------------------------------------------------------------------------
383 */
384
385static inline void
387{
388 /*
389 * All callers need to have held interrupts in some form, otherwise
390 * interrupt processing could wait for the IO to complete, while in an
391 * intermediary state.
392 */
394
396 "updating state to %s",
397 pgaio_io_state_get_name(new_state));
398
399 /*
400 * Ensure the changes signified by the new state are visible before the
401 * new state becomes visible.
402 */
404
405 ioh->state = new_state;
406}
407
408static void
410{
411 Assert(!ioh->resowner);
413
416}
417
418/*
419 * Stage IO for execution and, if appropriate, submit it immediately.
420 *
421 * Should only be called from pgaio_io_start_*().
422 */
423void
425{
426 bool needs_synchronous;
427
431
432 /*
433 * Otherwise an interrupt, in the middle of staging and possibly executing
434 * the IO, could end up trying to wait for the IO, leading to state
435 * confusion.
436 */
438
439 ioh->op = op;
440 ioh->result = 0;
441
443
444 /* allow a new IO to be staged */
446
448
450
451 /*
452 * Synchronous execution has to be executed, well, synchronously, so check
453 * that first.
454 */
455 needs_synchronous = pgaio_io_needs_synchronous_execution(ioh);
456
458 "staged (synchronous: %d, in_batch: %d)",
459 needs_synchronous, pgaio_my_backend->in_batchmode);
460
461 if (!needs_synchronous)
462 {
465
466 /*
467 * Unless code explicitly opted into batching IOs, submit the IO
468 * immediately.
469 */
472 }
473 else
474 {
477 }
478
480}
481
482bool
484{
485 /*
486 * If the caller said to execute the IO synchronously, do so.
487 *
488 * XXX: We could optimize the logic when to execute synchronously by first
489 * checking if there are other IOs in flight and only synchronously
490 * executing if not. Unclear whether that'll be sufficiently common to be
491 * worth worrying about.
492 */
493 if (ioh->flags & PGAIO_HF_SYNCHRONOUS)
494 return true;
495
496 /* Check if the IO method requires synchronous execution of IO */
499
500 return false;
501}
502
503/*
504 * Handle IO being processed by IO method.
505 *
506 * Should be called by IO methods / synchronous IO execution, just before the
507 * IO is performed.
508 */
509void
511{
513
515}
516
517/*
518 * Handle IO getting completed by a method.
519 *
520 * Should be called by IO methods / synchronous IO execution, just after the
521 * IO has been performed.
522 *
523 * Expects to be called in a critical section. We expect IOs to be usable for
524 * WAL etc, which requires being able to execute completion callbacks in a
525 * critical section.
526 */
527void
529{
531
533
534 ioh->result = result;
535
537
538 INJECTION_POINT("aio-process-completion-before-shared", ioh);
539
541
543
544 /* condition variable broadcast ensures state is visible before wakeup */
546
547 /* contains call to pgaio_io_call_complete_local() */
548 if (ioh->owner_procno == MyProcNumber)
549 pgaio_io_reclaim(ioh);
550}
551
552/*
553 * Has the IO completed and thus the IO handle been reused?
554 *
555 * This is useful when waiting for IO completion at a low level (e.g. in an IO
556 * method's ->wait_one() callback).
557 */
558bool
560{
561 *state = ioh->state;
562
563 /*
564 * Ensure that we don't see an earlier state of the handle than ioh->state
565 * due to compiler or CPU reordering. This protects both ->generation as
566 * directly used here, and other fields in the handle accessed in the
567 * caller if the handle was not reused.
568 */
570
571 return ioh->generation != ref_generation;
572}
573
574/*
575 * Wait for IO to complete. External code should never use this, outside of
576 * the AIO subsystem waits are only allowed via pgaio_wref_wait().
577 */
578static void
579pgaio_io_wait(PgAioHandle *ioh, uint64 ref_generation)
580{
582 bool am_owner;
583
584 am_owner = ioh->owner_procno == MyProcNumber;
585
586 if (pgaio_io_was_recycled(ioh, ref_generation, &state))
587 return;
588
589 if (am_owner)
590 {
595 {
596 elog(PANIC, "waiting for own IO %d in wrong state: %s",
598 }
599 }
600
601 while (true)
602 {
603 if (pgaio_io_was_recycled(ioh, ref_generation, &state))
604 return;
605
606 switch ((PgAioHandleState) state)
607 {
608 case PGAIO_HS_IDLE:
610 elog(ERROR, "IO in wrong state: %d", state);
611 break;
612
614
615 /*
616 * If we need to wait via the IO method, do so now. Don't
617 * check via the IO method if the issuing backend is executing
618 * the IO synchronously.
619 */
621 {
622 pgaio_method_ops->wait_one(ioh, ref_generation);
623 continue;
624 }
625 /* fallthrough */
626
627 /* waiting for owner to submit */
628 case PGAIO_HS_DEFINED:
629 case PGAIO_HS_STAGED:
630 /* waiting for reaper to complete */
631 /* fallthrough */
633 /* shouldn't be able to hit this otherwise */
635 /* ensure we're going to get woken up */
637
638 while (!pgaio_io_was_recycled(ioh, ref_generation, &state))
639 {
642 break;
643 ConditionVariableSleep(&ioh->cv, WAIT_EVENT_AIO_IO_COMPLETION);
644 }
645
647 break;
648
651
652 /*
653 * Note that no interrupts are processed between
654 * pgaio_io_was_recycled() and this check - that's important
655 * as otherwise an interrupt could have already reclaimed the
656 * handle.
657 */
658 if (am_owner)
659 pgaio_io_reclaim(ioh);
660 return;
661 }
662 }
663}
664
665/*
666 * Make IO handle ready to be reused after IO has completed or after the
667 * handle has been released without being used.
668 *
669 * Note that callers need to be careful about only calling this in the right
670 * state and that no interrupts can be processed between the state check and
671 * the call to pgaio_io_reclaim(). Otherwise interrupt processing could
672 * already have reclaimed the handle.
673 */
674static void
676{
677 /* This is only ok if it's our IO */
679 Assert(ioh->state != PGAIO_HS_IDLE);
680
681 /* see comment in function header */
683
684 /*
685 * It's a bit ugly, but right now the easiest place to put the execution
686 * of local completion callbacks is this function, as we need to execute
687 * local callbacks just before reclaiming at multiple callsites.
688 */
690 {
691 PgAioResult local_result;
692
693 local_result = pgaio_io_call_complete_local(ioh);
695
696 if (ioh->report_return)
697 {
698 ioh->report_return->result = local_result;
700 }
701 }
702
704 "reclaiming: distilled_result: (status %s, id %u, error_data %d), raw_result: %d",
706 ioh->distilled_result.id,
708 ioh->result);
709
710 /* if the IO has been defined, it's on the in-flight list, remove */
711 if (ioh->state != PGAIO_HS_HANDED_OUT)
713
714 if (ioh->resowner)
715 {
717 ioh->resowner = NULL;
718 }
719
720 Assert(!ioh->resowner);
721
722 /*
723 * Update generation & state first, before resetting the IO's fields,
724 * otherwise a concurrent "viewer" could think the fields are valid, even
725 * though they are being reset. Increment the generation first, so that
726 * we can assert elsewhere that we never wait for an IDLE IO. While it's
727 * a bit weird for the state to go backwards for a generation, it's OK
728 * here, as there cannot be references to the "reborn" IO yet. Can't
729 * update both at once, so something has to give.
730 */
731 ioh->generation++;
733
734 /* ensure the state update is visible before we reset fields */
736
737 ioh->op = PGAIO_OP_INVALID;
739 ioh->flags = 0;
740 ioh->num_callbacks = 0;
741 ioh->handle_data_len = 0;
742 ioh->report_return = NULL;
743 ioh->result = 0;
745
746 /*
747 * We push the IO to the head of the idle IO list, that seems more cache
748 * efficient in cases where only a few IOs are used.
749 */
751
753}
754
755/*
756 * Wait for an IO handle to become usable.
757 *
758 * This only really is useful for pgaio_io_acquire().
759 */
760static void
762{
763 int reclaimed = 0;
764
765 pgaio_debug(DEBUG2, "waiting for free IO with %d pending, %u in-flight, %u idle IOs",
769
770 /*
771 * First check if any of our IOs actually have completed - when using
772 * worker, that'll often be the case. We could do so as part of the loop
773 * below, but that'd potentially lead us to wait for some IO submitted
774 * before.
775 */
776 for (int i = 0; i < io_max_concurrency; i++)
777 {
779
781 {
782 /*
783 * Note that no interrupts are processed between the state check
784 * and the call to reclaim - that's important as otherwise an
785 * interrupt could have already reclaimed the handle.
786 *
787 * Need to ensure that there's no reordering, in the more common
788 * paths, where we wait for IO, that's done by
789 * pgaio_io_was_recycled().
790 */
792 pgaio_io_reclaim(ioh);
793 reclaimed++;
794 }
795 }
796
797 if (reclaimed > 0)
798 return;
799
800 /*
801 * If we have any unsubmitted IOs, submit them now. We'll start waiting in
802 * a second, so it's better they're in flight. This also addresses the
803 * edge-case that all IOs are unsubmitted.
804 */
807
808 /* possibly some IOs finished during submission */
810 return;
811
814 errmsg_internal("no free IOs despite no in-flight IOs"),
815 errdetail_internal("%d pending, %u in-flight, %u idle IOs",
819
820 /*
821 * Wait for the oldest in-flight IO to complete.
822 *
823 * XXX: Reusing the general IO wait is suboptimal, we don't need to wait
824 * for that specific IO to complete, we just need *any* IO to complete.
825 */
826 {
829 uint64 generation = ioh->generation;
830
831 switch ((PgAioHandleState) ioh->state)
832 {
833 /* should not be in in-flight list */
834 case PGAIO_HS_IDLE:
835 case PGAIO_HS_DEFINED:
837 case PGAIO_HS_STAGED:
839 elog(ERROR, "shouldn't get here with io:%d in state %d",
840 pgaio_io_get_id(ioh), ioh->state);
841 break;
842
846 "waiting for free io with %u in flight",
848
849 /*
850 * In a more general case this would be racy, because the
851 * generation could increase after we read ioh->state above.
852 * But we are only looking at IOs by the current backend and
853 * the IO can only be recycled by this backend. Even this is
854 * only OK because we get the handle's generation before
855 * potentially processing interrupts, e.g. as part of
856 * pgaio_debug_io().
857 */
858 pgaio_io_wait(ioh, generation);
859 break;
860
862
863 /*
864 * It's possible that another backend just finished this IO.
865 *
866 * Note that no interrupts are processed between the state
867 * check and the call to reclaim - that's important as
868 * otherwise an interrupt could have already reclaimed the
869 * handle.
870 *
871 * Need to ensure that there's no reordering, in the more
872 * common paths, where we wait for IO, that's done by
873 * pgaio_io_was_recycled().
874 */
876 pgaio_io_reclaim(ioh);
877 break;
878 }
879
881 elog(PANIC, "no idle IO after waiting for IO to terminate");
882 return;
883 }
884}
885
886/*
887 * Internal - code outside of AIO should never need this and it'd be hard for
888 * such code to be safe.
889 */
890static PgAioHandle *
892{
893 PgAioHandle *ioh;
894
896
897 ioh = &pgaio_ctl->io_handles[iow->aio_index];
898
899 *ref_generation = ((uint64) iow->generation_upper) << 32 |
900 iow->generation_lower;
901
902 Assert(*ref_generation != 0);
903
904 return ioh;
905}
906
907static const char *
909{
910#define PGAIO_HS_TOSTR_CASE(sym) case PGAIO_HS_##sym: return #sym
911 switch ((PgAioHandleState) s)
912 {
914 PGAIO_HS_TOSTR_CASE(HANDED_OUT);
915 PGAIO_HS_TOSTR_CASE(DEFINED);
916 PGAIO_HS_TOSTR_CASE(STAGED);
917 PGAIO_HS_TOSTR_CASE(SUBMITTED);
918 PGAIO_HS_TOSTR_CASE(COMPLETED_IO);
919 PGAIO_HS_TOSTR_CASE(COMPLETED_SHARED);
920 PGAIO_HS_TOSTR_CASE(COMPLETED_LOCAL);
921 }
922#undef PGAIO_HS_TOSTR_CASE
923
924 return NULL; /* silence compiler */
925}
926
927const char *
929{
930 return pgaio_io_state_get_name(ioh->state);
931}
932
933const char *
935{
936 switch ((PgAioResultStatus) rs)
937 {
938 case PGAIO_RS_UNKNOWN:
939 return "UNKNOWN";
940 case PGAIO_RS_OK:
941 return "OK";
942 case PGAIO_RS_WARNING:
943 return "WARNING";
944 case PGAIO_RS_PARTIAL:
945 return "PARTIAL";
946 case PGAIO_RS_ERROR:
947 return "ERROR";
948 }
949
950 return NULL; /* silence compiler */
951}
952
953
954
955/* --------------------------------------------------------------------------------
956 * Functions primarily related to IO Wait References
957 * --------------------------------------------------------------------------------
958 */
959
960/*
961 * Mark a wait reference as invalid
962 */
963void
965{
967}
968
969/* Is the wait reference valid? */
970bool
972{
973 return iow->aio_index != PG_UINT32_MAX;
974}
975
976/*
977 * Similar to pgaio_io_get_id(), just for wait references.
978 */
979int
981{
983 return iow->aio_index;
984}
985
986/*
987 * Wait for the IO to have completed. Can be called in any process, not just
988 * in the issuing backend.
989 */
990void
992{
993 uint64 ref_generation;
994 PgAioHandle *ioh;
995
996 ioh = pgaio_io_from_wref(iow, &ref_generation);
997
998 pgaio_io_wait(ioh, ref_generation);
999}
1000
1001/*
1002 * Check if the referenced IO completed, without blocking.
1003 */
1004bool
1006{
1007 uint64 ref_generation;
1009 bool am_owner;
1010 PgAioHandle *ioh;
1011
1012 ioh = pgaio_io_from_wref(iow, &ref_generation);
1013
1014 if (pgaio_io_was_recycled(ioh, ref_generation, &state))
1015 return true;
1016
1017 if (state == PGAIO_HS_IDLE)
1018 return true;
1019
1020 am_owner = ioh->owner_procno == MyProcNumber;
1021
1024 {
1025 /*
1026 * Note that no interrupts are processed between
1027 * pgaio_io_was_recycled() and this check - that's important as
1028 * otherwise an interrupt could have already reclaimed the handle.
1029 */
1030 if (am_owner)
1031 pgaio_io_reclaim(ioh);
1032 return true;
1033 }
1034
1035 /*
1036 * XXX: It likely would be worth checking in with the io method, to give
1037 * the IO method a chance to check if there are completion events queued.
1038 */
1039
1040 return false;
1041}
1042
1043
1044
1045/* --------------------------------------------------------------------------------
1046 * Actions on multiple IOs.
1047 * --------------------------------------------------------------------------------
1048 */
1049
1050/*
1051 * Submit IOs in batches going forward.
1052 *
1053 * Submitting multiple IOs at once can be substantially faster than doing so
1054 * one-by-one. At the same time, submitting multiple IOs at once requires more
1055 * care to avoid deadlocks.
1056 *
1057 * Consider backend A staging an IO for buffer 1 and then trying to start IO
1058 * on buffer 2, while backend B does the inverse. If A submitted the IO before
1059 * moving on to buffer 2, this works just fine, B will wait for the IO to
1060 * complete. But if batching were used, each backend will wait for IO that has
1061 * not yet been submitted to complete, i.e. forever.
1062 *
1063 * End batch submission mode with pgaio_exit_batchmode(). (Throwing errors is
1064 * allowed; error recovery will end the batch.)
1065 *
1066 * To avoid deadlocks, code needs to ensure that it will not wait for another
1067 * backend while there is unsubmitted IO. E.g. by using conditional lock
1068 * acquisition when acquiring buffer locks. To check if there currently are
1069 * staged IOs, call pgaio_have_staged() and to submit all staged IOs call
1070 * pgaio_submit_staged().
1071 *
1072 * It is not allowed to enter batchmode while already in batchmode, it's
1073 * unlikely to ever be needed, as code needs to be explicitly aware of being
1074 * called in batchmode, to avoid the deadlock risks explained above.
1075 *
1076 * Note that IOs may get submitted before pgaio_exit_batchmode() is called,
1077 * e.g. because too many IOs have been staged or because pgaio_submit_staged()
1078 * was called.
1079 */
1080void
1082{
1084 elog(ERROR, "starting batch while batch already in progress");
1086}
1087
1088/*
1089 * Stop submitting IOs in batches.
1090 */
1091void
1093{
1095
1098}
1099
1100/*
1101 * Are there staged but unsubmitted IOs?
1102 *
1103 * See comment above pgaio_enter_batchmode() for why code may need to check if
1104 * there is IO in that state.
1105 */
1106bool
1108{
1111 return pgaio_my_backend->num_staged_ios > 0;
1112}
1113
1114/*
1115 * Submit all staged but not yet submitted IOs.
1116 *
1117 * Unless in batch mode, this never needs to be called, as IOs get submitted
1118 * as soon as possible. While in batchmode pgaio_submit_staged() can be called
1119 * before waiting on another backend, to avoid the risk of deadlocks. See
1120 * pgaio_enter_batchmode().
1121 */
1122void
1124{
1125 int total_submitted = 0;
1126 int did_submit;
1127
1129 return;
1130
1131
1133
1136
1138
1139 total_submitted += did_submit;
1140
1141 Assert(total_submitted == did_submit);
1142
1144
1146 "aio: submitted %d IOs",
1147 total_submitted);
1148}
1149
1150
1151
1152/* --------------------------------------------------------------------------------
1153 * Other
1154 * --------------------------------------------------------------------------------
1155 */
1156
1157
1158/*
1159 * Perform AIO related cleanup after an error.
1160 *
1161 * This should be called early in the error recovery paths, as later steps may
1162 * need to issue AIO (e.g. to record a transaction abort WAL record).
1163 */
1164void
1166{
1167 /*
1168 * It is possible that code errored out after pgaio_enter_batchmode() but
1169 * before pgaio_exit_batchmode() was called. In that case we need to
1170 * submit the IO now.
1171 */
1173 {
1175
1177 }
1178
1179 /*
1180 * As we aren't in batchmode, there shouldn't be any unsubmitted IOs.
1181 */
1183}
1184
1185/*
1186 * Perform AIO related checks at (sub-)transactional boundaries.
1187 *
1188 * This should be called late during (sub-)transactional commit/abort, after
1189 * all steps that might need to perform AIO, so that we can verify that the
1190 * AIO subsystem is in a valid state at the end of a transaction.
1191 */
1192void
1193AtEOXact_Aio(bool is_commit)
1194{
1195 /*
1196 * We should never be in batch mode at transactional boundaries. In case
1197 * an error was thrown while in batch mode, pgaio_error_cleanup() should
1198 * have exited batchmode.
1199 *
1200 * In case we are in batchmode somehow, make sure to submit all staged
1201 * IOs, other backends may need them to complete to continue.
1202 */
1204 {
1206 elog(WARNING, "open AIO batch at end of (sub-)transaction");
1207 }
1208
1209 /*
1210 * As we aren't in batchmode, there shouldn't be any unsubmitted IOs.
1211 */
1213}
1214
1215/*
1216 * Need to submit staged but not yet submitted IOs using the fd, otherwise
1217 * the IO would end up targeting something bogus.
1218 */
1219void
1221{
1222 /*
1223 * Might be called before AIO is initialized or in a subprocess that
1224 * doesn't use AIO.
1225 */
1226 if (!pgaio_my_backend)
1227 return;
1228
1229 /*
1230 * For now just submit all staged IOs - we could be more selective, but
1231 * it's probably not worth it.
1232 */
1234 {
1236 "submitting %d IOs before FD %d gets closed",
1239 }
1240
1241 /*
1242 * If requested by the IO method, wait for all IOs that use the
1243 * to-be-closed FD.
1244 */
1246 {
1247 /*
1248 * As waiting for one IO to complete may complete multiple IOs, we
1249 * can't just use a mutable list iterator. The maximum number of
1250 * in-flight IOs is fairly small, so just restart the loop after
1251 * waiting for an IO.
1252 */
1254 {
1255 dlist_iter iter;
1256 PgAioHandle *ioh = NULL;
1257 uint64 generation;
1258
1260 {
1261 ioh = dclist_container(PgAioHandle, node, iter.cur);
1262
1263 generation = ioh->generation;
1264
1265 if (pgaio_io_uses_fd(ioh, fd))
1266 break;
1267 else
1268 ioh = NULL;
1269 }
1270
1271 if (!ioh)
1272 break;
1273
1275 "waiting for IO before FD %d gets closed, %u in-flight IOs",
1277
1278 /* see comment in pgaio_io_wait_for_free() about raciness */
1279 pgaio_io_wait(ioh, generation);
1280 }
1281 }
1282}
1283
1284/*
1285 * Registered as before_shmem_exit() callback in pgaio_init_backend()
1286 */
1287void
1289{
1292
1293 /* first clean up resources as we would at a transaction boundary */
1294 AtEOXact_Aio(code == 0);
1295
1296 /*
1297 * Before exiting, make sure that all IOs are finished. That has two main
1298 * purposes:
1299 *
1300 * - Some kernel-level AIO mechanisms don't deal well with the issuer of
1301 * an AIO exiting before IO completed
1302 *
1303 * - It'd be confusing to see partially finished IOs in stats views etc
1304 */
1306 {
1308 uint64 generation = ioh->generation;
1309
1311 "waiting for IO to complete during shutdown, %u in-flight IOs",
1313
1314 /* see comment in pgaio_io_wait_for_free() about raciness */
1315 pgaio_io_wait(ioh, generation);
1316 }
1317
1318 pgaio_my_backend = NULL;
1319}
1320
1321void
1322assign_io_method(int newval, void *extra)
1323{
1326
1328}
1329
1330bool
1332{
1333 if (*newval == -1)
1334 {
1335 /*
1336 * Auto-tuning will be applied later during startup, as auto-tuning
1337 * depends on the value of various GUCs.
1338 */
1339 return true;
1340 }
1341 else if (*newval == 0)
1342 {
1343 GUC_check_errdetail("Only -1 or values bigger than 0 are valid.");
1344 return false;
1345 }
1346
1347 return true;
1348}
void pgaio_io_process_completion(PgAioHandle *ioh, int result)
Definition: aio.c:528
int io_method
Definition: aio.c:74
bool pgaio_wref_valid(PgAioWaitRef *iow)
Definition: aio.c:971
int pgaio_io_get_id(PgAioHandle *ioh)
Definition: aio.c:342
PgAioBackend * pgaio_my_backend
Definition: aio.c:81
const char * pgaio_result_status_string(PgAioResultStatus rs)
Definition: aio.c:934
PgAioHandle * pgaio_io_acquire(struct ResourceOwnerData *resowner, PgAioReturn *ret)
Definition: aio.c:162
void assign_io_method(int newval, void *extra)
Definition: aio.c:1322
static void pgaio_io_update_state(PgAioHandle *ioh, PgAioHandleState new_state)
Definition: aio.c:386
void pgaio_wref_clear(PgAioWaitRef *iow)
Definition: aio.c:964
bool pgaio_io_needs_synchronous_execution(PgAioHandle *ioh)
Definition: aio.c:483
static void pgaio_io_wait_for_free(void)
Definition: aio.c:761
#define PGAIO_HS_TOSTR_CASE(sym)
static const char * pgaio_io_state_get_name(PgAioHandleState s)
Definition: aio.c:908
void pgaio_io_release_resowner(dlist_node *ioh_node, bool on_error)
Definition: aio.c:266
static void pgaio_io_resowner_register(PgAioHandle *ioh)
Definition: aio.c:409
static PgAioHandle * pgaio_io_from_wref(PgAioWaitRef *iow, uint64 *ref_generation)
Definition: aio.c:891
void pgaio_io_get_wref(PgAioHandle *ioh, PgAioWaitRef *iow)
Definition: aio.c:366
void pgaio_closing_fd(int fd)
Definition: aio.c:1220
void pgaio_io_stage(PgAioHandle *ioh, PgAioOp op)
Definition: aio.c:424
int io_max_concurrency
Definition: aio.c:75
void pgaio_io_set_flag(PgAioHandle *ioh, PgAioHandleFlags flag)
Definition: aio.c:330
bool pgaio_have_staged(void)
Definition: aio.c:1107
PgAioCtl * pgaio_ctl
Definition: aio.c:78
const IoMethodOps * pgaio_method_ops
Definition: aio.c:96
bool pgaio_wref_check_done(PgAioWaitRef *iow)
Definition: aio.c:1005
static const IoMethodOps *const pgaio_method_ops_table[]
Definition: aio.c:84
static void pgaio_io_reclaim(PgAioHandle *ioh)
Definition: aio.c:675
ProcNumber pgaio_io_get_owner(PgAioHandle *ioh)
Definition: aio.c:355
StaticAssertDecl(lengthof(io_method_options)==lengthof(pgaio_method_ops_table)+1, "io_method_options out of sync with pgaio_method_ops_table")
void pgaio_enter_batchmode(void)
Definition: aio.c:1081
void pgaio_submit_staged(void)
Definition: aio.c:1123
const char * pgaio_io_get_state_name(PgAioHandle *ioh)
Definition: aio.c:928
const struct config_enum_entry io_method_options[]
Definition: aio.c:64
bool pgaio_io_was_recycled(PgAioHandle *ioh, uint64 ref_generation, PgAioHandleState *state)
Definition: aio.c:559
void pgaio_io_prepare_submit(PgAioHandle *ioh)
Definition: aio.c:510
void pgaio_wref_wait(PgAioWaitRef *iow)
Definition: aio.c:991
void pgaio_error_cleanup(void)
Definition: aio.c:1165
void pgaio_io_release(PgAioHandle *ioh)
Definition: aio.c:240
int pgaio_wref_get_id(PgAioWaitRef *iow)
Definition: aio.c:980
void AtEOXact_Aio(bool is_commit)
Definition: aio.c:1193
void pgaio_shutdown(int code, Datum arg)
Definition: aio.c:1288
bool check_io_max_concurrency(int *newval, void **extra, GucSource source)
Definition: aio.c:1331
static void pgaio_io_wait(PgAioHandle *ioh, uint64 ref_generation)
Definition: aio.c:579
void pgaio_exit_batchmode(void)
Definition: aio.c:1092
PgAioHandle * pgaio_io_acquire_nb(struct ResourceOwnerData *resowner, PgAioReturn *ret)
Definition: aio.c:188
@ IOMETHOD_WORKER
Definition: aio.h:35
@ IOMETHOD_SYNC
Definition: aio.h:34
@ PGAIO_TID_INVALID
Definition: aio.h:119
PgAioOp
Definition: aio.h:88
@ PGAIO_OP_INVALID
Definition: aio.h:90
PgAioHandleFlags
Definition: aio.h:49
@ PGAIO_HF_SYNCHRONOUS
Definition: aio.h:70
#define DEFAULT_IO_METHOD
Definition: aio.h:42
void pgaio_io_call_stage(PgAioHandle *ioh)
Definition: aio_callback.c:199
PgAioResult pgaio_io_call_complete_local(PgAioHandle *ioh)
Definition: aio_callback.c:285
void pgaio_io_call_complete_shared(PgAioHandle *ioh)
Definition: aio_callback.c:225
PgAioHandleState
Definition: aio_internal.h:44
@ PGAIO_HS_STAGED
Definition: aio_internal.h:66
@ PGAIO_HS_COMPLETED_SHARED
Definition: aio_internal.h:82
@ PGAIO_HS_DEFINED
Definition: aio_internal.h:59
@ PGAIO_HS_SUBMITTED
Definition: aio_internal.h:69
@ PGAIO_HS_IDLE
Definition: aio_internal.h:46
@ PGAIO_HS_HANDED_OUT
Definition: aio_internal.h:53
@ PGAIO_HS_COMPLETED_IO
Definition: aio_internal.h:72
@ PGAIO_HS_COMPLETED_LOCAL
Definition: aio_internal.h:89
#define pgaio_debug(elevel, msg,...)
Definition: aio_internal.h:382
#define pgaio_debug_io(elevel, ioh, msg,...)
Definition: aio_internal.h:395
#define PGAIO_SUBMIT_BATCH_SIZE
Definition: aio_internal.h:28
void pgaio_io_perform_synchronously(PgAioHandle *ioh)
Definition: aio_io.c:116
bool pgaio_io_uses_fd(PgAioHandle *ioh, int fd)
Definition: aio_io.c:197
bool pgaio_io_has_target(PgAioHandle *ioh)
Definition: aio_target.c:40
PgAioResultStatus
Definition: aio_types.h:79
@ PGAIO_RS_OK
Definition: aio_types.h:81
@ PGAIO_RS_UNKNOWN
Definition: aio_types.h:80
@ PGAIO_RS_PARTIAL
Definition: aio_types.h:82
@ PGAIO_RS_ERROR
Definition: aio_types.h:84
@ PGAIO_RS_WARNING
Definition: aio_types.h:83
#define pg_read_barrier()
Definition: atomics.h:154
#define pg_write_barrier()
Definition: atomics.h:155
#define PG_UINT32_MAX
Definition: c.h:600
uint64_t uint64
Definition: c.h:544
uint32_t uint32
Definition: c.h:543
#define lengthof(array)
Definition: c.h:792
bool ConditionVariableCancelSleep(void)
void ConditionVariableBroadcast(ConditionVariable *cv)
void ConditionVariablePrepareToSleep(ConditionVariable *cv)
void ConditionVariableSleep(ConditionVariable *cv, uint32 wait_event_info)
int errmsg_internal(const char *fmt,...)
Definition: elog.c:1170
int errdetail_internal(const char *fmt,...)
Definition: elog.c:1243
#define DEBUG3
Definition: elog.h:28
#define WARNING
Definition: elog.h:36
#define DEBUG2
Definition: elog.h:29
#define PANIC
Definition: elog.h:42
#define ERROR
Definition: elog.h:39
#define elog(elevel,...)
Definition: elog.h:226
#define ereport(elevel,...)
Definition: elog.h:150
#define DEBUG5
Definition: elog.h:26
#define DEBUG4
Definition: elog.h:27
ProcNumber MyProcNumber
Definition: globals.c:90
bool IsUnderPostmaster
Definition: globals.c:120
volatile uint32 CritSectionCount
Definition: globals.c:45
#define newval
#define GUC_check_errdetail
Definition: guc.h:505
GucSource
Definition: guc.h:112
Assert(PointerIsAligned(start, uint64))
#define dclist_container(type, membername, ptr)
Definition: ilist.h:947
#define dclist_head_element(type, membername, lhead)
Definition: ilist.h:955
static void dclist_push_tail(dclist_head *head, dlist_node *node)
Definition: ilist.h:709
static uint32 dclist_count(const dclist_head *head)
Definition: ilist.h:932
static bool dclist_is_empty(const dclist_head *head)
Definition: ilist.h:682
static void dclist_delete_from(dclist_head *head, dlist_node *node)
Definition: ilist.h:763
static dlist_node * dclist_pop_head_node(dclist_head *head)
Definition: ilist.h:789
static void dclist_push_head(dclist_head *head, dlist_node *node)
Definition: ilist.h:693
#define dlist_container(type, membername, ptr)
Definition: ilist.h:593
#define dclist_foreach(iter, lhead)
Definition: ilist.h:970
#define INJECTION_POINT(name, arg)
int i
Definition: isn.c:77
const IoMethodOps pgaio_sync_ops
Definition: method_sync.c:28
const IoMethodOps pgaio_worker_ops
Definition: method_worker.c:84
#define RESUME_INTERRUPTS()
Definition: miscadmin.h:136
#define INTERRUPTS_CAN_BE_PROCESSED()
Definition: miscadmin.h:130
#define START_CRIT_SECTION()
Definition: miscadmin.h:150
#define HOLD_INTERRUPTS()
Definition: miscadmin.h:134
#define END_CRIT_SECTION()
Definition: miscadmin.h:152
void * arg
static rewind_source * source
Definition: pg_rewind.c:89
uint64_t Datum
Definition: postgres.h:70
static int fd(const char *x, int i)
Definition: preproc-init.c:105
int ProcNumber
Definition: procnumber.h:24
ResourceOwner CurrentResourceOwner
Definition: resowner.c:173
void ResourceOwnerRememberAioHandle(ResourceOwner owner, struct dlist_node *ioh_node)
Definition: resowner.c:1101
void ResourceOwnerForgetAioHandle(ResourceOwner owner, struct dlist_node *ioh_node)
Definition: resowner.c:1107
bool wait_on_fd_before_close
Definition: aio_internal.h:268
int(* submit)(uint16 num_staged_ios, PgAioHandle **staged_ios)
Definition: aio_internal.h:308
void(* wait_one)(PgAioHandle *ioh, uint64 ref_generation)
Definition: aio_internal.h:329
bool(* needs_synchronous_execution)(PgAioHandle *ioh)
Definition: aio_internal.h:294
uint32 io_handle_off
Definition: aio_internal.h:194
dclist_head in_flight_ios
Definition: aio_internal.h:225
uint16 num_staged_ios
Definition: aio_internal.h:214
dclist_head idle_ios
Definition: aio_internal.h:197
PgAioHandle * staged_ios[PGAIO_SUBMIT_BATCH_SIZE]
Definition: aio_internal.h:215
PgAioHandle * handed_out_io
Definition: aio_internal.h:206
PgAioHandle * io_handles
Definition: aio_internal.h:252
uint32 io_handle_count
Definition: aio_internal.h:251
PgAioTargetData target_data
Definition: aio_internal.h:187
struct ResourceOwnerData * resowner
Definition: aio_internal.h:148
int32 owner_procno
Definition: aio_internal.h:131
PgAioResult distilled_result
Definition: aio_internal.h:162
dlist_node node
Definition: aio_internal.h:146
uint8 handle_data_len
Definition: aio_internal.h:128
PgAioReturn * report_return
Definition: aio_internal.h:177
uint64 generation
Definition: aio_internal.h:152
uint8 num_callbacks
Definition: aio_internal.h:116
dlist_node resowner_node
Definition: aio_internal.h:149
ConditionVariable cv
Definition: aio_internal.h:159
uint32 status
Definition: aio_types.h:108
uint32 error_data
Definition: aio_types.h:111
uint32 id
Definition: aio_types.h:105
PgAioResult result
Definition: aio_types.h:132
PgAioTargetData target_data
Definition: aio_types.h:133
uint32 generation_upper
Definition: aio_types.h:45
uint32 aio_index
Definition: aio_types.h:35
uint32 generation_lower
Definition: aio_types.h:46
Definition: guc.h:174
dlist_node * cur
Definition: ilist.h:179
Definition: regguts.h:323
char * flag(int b)
Definition: test-ctype.c:33