PostgreSQL how to speed up ORDER BY with text column

Question

Table schema:

CREATE SEQUENCE fsa_online_id_seq INCREMENT 1 MINVALUE 1 MAXVALUE 9223372036854775807 START 1 CACHE 1;

CREATE TABLE "public"."fsa_online" (
    "id" integer DEFAULT nextval('fsa_online_id_seq') NOT NULL,
    "fsa_uuid" uuid NOT NULL,
    "use_version" integer,
    "last_original_version" integer,
    "created_at" timestamp(0),
    "updated_at" timestamp(0),
    "deleted_at" timestamp(0),
    "is_drug" boolean DEFAULT true NOT NULL,
    CONSTRAINT "fsa_online_fsa_uuid_unique" UNIQUE ("fsa_uuid"),
    CONSTRAINT "fsa_online_pkey" PRIMARY KEY ("id")
) WITH (oids = false);

CREATE INDEX "fsa_online_is_drug_index" ON "public"."fsa_online" USING btree ("is_drug");
CREATE INDEX "fsa_online_last_original_version_index" ON "public"."fsa_online" USING btree ("last_original_version");
CREATE INDEX "fsa_online_use_version_index" ON "public"."fsa_online" USING btree ("use_version");


CREATE SEQUENCE fsa_online_data_id_seq INCREMENT 1 MINVALUE 1 MAXVALUE 9223372036854775807 START 1 CACHE 1;

CREATE TABLE "public"."fsa_online_data" (
    "id" integer DEFAULT nextval('fsa_online_data_id_seq') NOT NULL,
    "fsa_id" integer NOT NULL,
    "reason" text,
    "is_original" boolean NOT NULL,
    "is_published" boolean DEFAULT true NOT NULL,
    "created_by_id" integer,
    "created_at" timestamp(0),
    "unparsed_data" jsonb,
    "raw_id" integer NOT NULL,
    "status_id" integer,
    "type_id" integer,
    "reg_num" character varying(255),
    "start_date" date,
    "end_date" date,
    "docs" jsonb,
    "docs_add" text,
    "scheme" text,
    "free_form" jsonb,
    "fio_expert" text,
    "lab_info" jsonb,
    "change_info" jsonb,
    "applicant_info" jsonb,
    "manufacturer_info" jsonb,
    "product_info" jsonb,
    "standard_info" jsonb,
    "manufacturer_tbl_info" jsonb,
    "product_tbl_info" jsonb,
    "certification_info" jsonb,
    "trade_name" text,
    "cert_num" character varying(255),
    "man_form_txt" text,
    "manufacturer_name" text,
    "man_country_id" integer,
    "serial_num" character varying(255),
    "serial_size" integer,
    "barcode" character varying(255),
    "barcode_type_id" integer,
    "is_cert" boolean NOT NULL,
    "original_data" jsonb,
    CONSTRAINT "fsa_online_data_pkey" PRIMARY KEY ("id"),
    CONSTRAINT "fsa_online_data_fsa_id_foreign" FOREIGN KEY (fsa_id) REFERENCES fsa_online(id) ON UPDATE CASCADE ON DELETE RESTRICT NOT DEFERRABLE
) WITH (oids = false);

CREATE INDEX "fsa_online_data_barcode_index" ON "public"."fsa_online_data" USING btree ("barcode");
CREATE INDEX "fsa_online_data_barcode_type_id_index" ON "public"."fsa_online_data" USING btree ("barcode_type_id");
CREATE INDEX "fsa_online_data_cert_num_index" ON "public"."fsa_online_data" USING btree ("cert_num");
CREATE INDEX "fsa_online_data_created_by_id_index" ON "public"."fsa_online_data" USING btree ("created_by_id");
CREATE INDEX "fsa_online_data_end_date_index" ON "public"."fsa_online_data" USING btree ("end_date");
CREATE INDEX "fsa_online_data_fsa_id_index" ON "public"."fsa_online_data" USING btree ("fsa_id");
CREATE INDEX "fsa_online_data_is_cert_index" ON "public"."fsa_online_data" USING btree ("is_cert");
CREATE INDEX "fsa_online_data_man_country_id_index" ON "public"."fsa_online_data" USING btree ("man_country_id");
CREATE INDEX "fsa_online_data_raw_id_index" ON "public"."fsa_online_data" USING btree ("raw_id");
CREATE INDEX "fsa_online_data_reg_num_index" ON "public"."fsa_online_data" USING btree ("reg_num");
CREATE INDEX "fsa_online_data_start_date_index" ON "public"."fsa_online_data" USING btree ("start_date");
CREATE INDEX "fsa_online_data_status_id_index" ON "public"."fsa_online_data" USING btree ("status_id");
CREATE INDEX "fsa_online_data_trade_name_index" ON "public"."fsa_online_data" USING btree ("trade_name");
CREATE INDEX "fsa_online_data_type_id_index" ON "public"."fsa_online_data" USING btree ("type_id");

ALTER TABLE "fsa_online" ADD CONSTRAINT "fsa_online_last_original_version_foreign" FOREIGN KEY (last_original_version) REFERENCES fsa_online_data(id) ON UPDATE CASCADE ON DELETE RESTRICT NOT DEFERRABLE;
ALTER TABLE "fsa_online" ADD CONSTRAINT "fsa_online_use_version_foreign" FOREIGN KEY (use_version) REFERENCES fsa_online_data(id) ON UPDATE CASCADE ON DELETE RESTRICT NOT DEFERRABLE;

The "fsa_online" table contains about 800 000 records (possible grow up to 3 million records)

The "fsa_online_data" table contains about 3.5 million records (possible grow up to 15-25 million records)

I have the following query:

select 
  "fsa_online"."id", 
  "fsa_online"."fsa_uuid", 
  "fsa_online"."use_version", 
  "fsa_online"."last_original_version", 
  "fsa_online"."is_drug" 
from 
  "fsa_online" 
  inner join "fsa_online_data" as "data" on "data"."id" = CASE WHEN fsa_online.use_version IS NULL THEN fsa_online.last_original_version ELSE fsa_online.use_version END 
where 
  "unparsed_data" is not null 
  and "fsa_online"."deleted_at" is null 
limit 10 offset 0

It took about 150 ms time.

But when I need to sort data, for example by "cert_num" column of the "fsa_online_data" table, it took an extremely long time (about 63000 ms).

select 
  "fsa_online"."id", 
  "fsa_online"."fsa_uuid", 
  "fsa_online"."use_version", 
  "fsa_online"."last_original_version", 
  "fsa_online"."is_drug" 
from 
  "fsa_online" 
  inner join "fsa_online_data" as "data" on "data"."id" = CASE WHEN fsa_online.use_version IS NULL THEN fsa_online.last_original_version ELSE fsa_online.use_version END 
where 
  "unparsed_data" is not null 
  and "fsa_online"."deleted_at" is null 
ORDER BY "data"."cert_num"
limit 10 offset 0

I did a little trick by adding COLLATE "C" (but that is not correct, I really need to compare strings as Unicode strings) to ORDER BY statement and query took about 27500 ms.

Here is a query plan:

"Limit  (cost=1037857.75..1037857.78 rows=10 width=545)"
"  Output: fsa_online.id, fsa_online.fsa_uuid, fsa_online.use_version, fsa_online.last_original_version, fsa_online.is_drug, ((data.cert_num)::character varying(255))"
"  ->  Sort  (cost=1037857.75..1039085.55 rows=491120 width=545)"
"        Output: fsa_online.id, fsa_online.fsa_uuid, fsa_online.use_version, fsa_online.last_original_version, fsa_online.is_drug, ((data.cert_num)::character varying(255))"
"        Sort Key: ((data.cert_num)::character varying(255)) COLLATE "C""
"        ->  Hash Join  (cost=972267.98..1027244.83 rows=491120 width=545)"
"              Output: fsa_online.id, fsa_online.fsa_uuid, fsa_online.use_version, fsa_online.last_original_version, fsa_online.is_drug, data.cert_num"
"              Inner Unique: true"
"              Hash Cond: (CASE WHEN (fsa_online.use_version IS NULL) THEN fsa_online.last_original_version ELSE fsa_online.use_version END = data.id)"
"              ->  Seq Scan on public.fsa_online  (cost=0.00..19143.06 rows=899706 width=29)"
"                    Output: fsa_online.id, fsa_online.fsa_uuid, fsa_online.use_version, fsa_online.last_original_version, fsa_online.is_drug"
"                    Filter: (fsa_online.deleted_at IS NULL)"
"              ->  Hash  (cost=934126.84..934126.84 rows=2077451 width=17)"
"                    Output: data.cert_num, data.id"
"                    ->  Seq Scan on public.fsa_online_data data  (cost=0.00..934126.84 rows=2077451 width=17)"
"                          Output: data.cert_num, data.id"
"                          Filter: (data.unparsed_data IS NOT NULL)"

Also, I need to perform queries using LIKE statements on different text columns (full-text search is not good for this case, because text columns contain an arbitrary data, like serial numbers).

select count(*)
from "fsa_online" 
inner join "fsa_online_data" as "data" on "data"."id" = 
CASE WHEN fsa_online.use_version IS NULL THEN fsa_online.last_original_version ELSE fsa_online.use_version END

The result is about 900 000 records. The "fsa_online_data" table size is 7GB.

Hardware list:

Samsung SSD EVO 850
Intel Core i7 6700k
16 GB DDR4 RAM

Tested on PostgreSQL versions: 9.6.9 and 10.5

How can I improve the performance of ORDER BY operations by text columns? I want to reduce query execution time to 200-300ms.

You talk about ORDER BY but you don't show the clause you actually used to produce the explain output. You talk about LIKE, but you don't show that in the query either. On which columns? Which patterns are possible? This overview of pattern matching options should be instrumental: dba.stackexchange.com/a/10696/3684 — Erwin Brandstetter
– Erwin Brandstetter, Commented Sep 15, 2018 at 1:36
@ErwinBrandstetter it is in my question. (ORDER BY "data"."cert_num") — Dmitry K.
– Dmitry K., Commented Sep 15, 2018 at 3:01
select count(*) from "fsa_online" inner join "fsa_online_data" as "data" on "data"."id" = CASE WHEN fsa_online.use_version IS NULL THEN fsa_online.last_original_version ELSE fsa_online.use_version END where "unparsed_data" is not null and "fsa_online"."deleted_at" is null ORDER BY "data"."cert_num" limit 10 offset 0 — Dmitry K.
– Dmitry K., Commented Sep 15, 2018 at 3:02

jjanes · Accepted Answer · 2018-09-15 20:33:17Z

4

The CASE in a join condition is killing you.

You should probably reorg your data so that use_version always contains the version to be used. If you can't do that, then create an expression index on the CASE expression:

create index on fsa_online ((CASE WHEN use_version IS NULL 
    THEN last_original_version ELSE use_version END));

answered Sep 15, 2018 at 20:33

jjanes

42.6k3 gold badges44 silver badges54 bronze badges

Thank you, removing CASE condition really made my query run much faster.

Dmitry K.
– Dmitry K.

2018-09-16 02:54:13 +00:00
Commented Sep 16, 2018 at 2:54

Add a comment |

Evan Carroll · Accepted Answer · 2018-09-15 22:33:27Z

1

Looking at your case statement,

CASE WHEN fsa_online.use_version IS NULL THEN fsa_online.last_original_version ELSE fsa_online.use_version END

That should instead be,

coalesce(use_version, last_original_version);

You can probably make it faster by doing something like this, (stupid " removed, as per the convention)

SELECT 
  fsa_online.id, 
  fsa_online.fsa_uuid, 
  fsa_online.use_version, 
  fsa_online.last_original_version, 
  fsa_online.is_drug 
FROM fsa_online_data AS data
CROSS JOIN LATERAL (
  SELECT *
  FROM fsa_online
  WHERE data.id = coalesce(use_version, last_original_version)
    AND fsa_online.deleted_at IS NULL 
) AS fsa_online
WHERE data.unparsed_data IS NOT NULL
ORDER BY cert_num
FETCH FIRST 10 ROWS ONLY;

edited Sep 15, 2018 at 22:33

answered Sep 15, 2018 at 22:25

Evan Carroll

65.8k50 gold badges263 silver badges512 bronze badges

Thanks for your answer. But i have the same time while using your version of query and my version. The most biggest problem here is an ordering by text column "cert_num", my query without ORDER BY took about 87ms, but when i adding ORDER BY clause it tooks about 63000ms.

Dmitry K.
– Dmitry K.

2018-09-16 02:34:15 +00:00
Commented Sep 16, 2018 at 2:34
Here is the query plan - explain.depesz.com/s/Y5JS

Dmitry K.
– Dmitry K.

2018-09-16 02:49:04 +00:00
Commented Sep 16, 2018 at 2:49
Nevertheless thank you, the bad place in my query was a CASE condition.

Dmitry K.
– Dmitry K.

2018-09-16 02:53:05 +00:00
Commented Sep 16, 2018 at 2:53
@DmitryK. The problem is PostgreSQL thinks that the inner query is going to return a shit ton of rows. Is the table unique on data.id = coalesce(use_version, last_original_version) AND fsa_online.deleted_at IS NULL ? If so try putting FETCH FIRST ROW ONLY in the inner query.

Evan Carroll
– Evan Carroll

2018-09-16 10:37:19 +00:00
Commented Sep 16, 2018 at 10:37

Add a comment |

Stack Exchange Network

PostgreSQL how to speed up ORDER BY with text column

2 Answers 2

Your Answer

Linked

Hot Network Questions

PostgreSQL how to speed up ORDER BY with text column

2 Answers 2

Your Answer

Sign up or log in

Post as a guest

Linked

Related

Hot Network Questions