How to improve this union to ensure consistent results?
-
29-09-2020 - |
Question
I've been trying to find an reliable and economic way of counting rows over a 7 day period and so far, this is the best I've been able to come up with:
Schema
CREATE TABLE IF NOT EXISTS "data" (
"id" SERIAL,
"hash" CHARACTER VARYING(255) NOT NULL,
"source" CHARACTER VARYING(255) NOT NULL,
"isFiltered" BOOLEAN NOT NULL,
"campaignId" INTEGER NOT NULL,
"data" JSON NOT NULL,
"meta" JSON NOT NULL,
"modifiedReason" CHARACTER VARYING(255) NULL DEFAULT NULL,
"createdAt" TIMESTAMP WITH TIME ZONE NOT NULL,
"updatedAt" TIMESTAMP WITH TIME ZONE NOT NULL,
PRIMARY KEY ("id")
);
CREATE TABLE IF NOT EXISTS "campaign" (
"id" SERIAL,
"userId" INTEGER NOT NULL,
"userCompanyId" INTEGER NOT NULL,
"type" CHARACTER VARYING(255) NULL DEFAULT NULL,
"title" CHARACTER VARYING(255) NULL DEFAULT NULL,
"description" CHARACTER VARYING(255) NULL DEFAULT NULL,
"sources" CHARACTER VARYING(255)[] NULL DEFAULT NULL,
"configuration" JSON NOT NULL,
"active" BOOLEAN NULL DEFAULT true,
"excludedUserNames" CHARACTER VARYING(255)[] NULL DEFAULT NULL,
"limit" INTEGER NULL DEFAULT 0,
"startAt" TIMESTAMP WITH TIME ZONE NOT NULL,
"endAt" TIMESTAMP WITH TIME ZONE NOT NULL,
"removedAt" TIMESTAMP WITH TIME ZONE NULL DEFAULT NULL,
"createdAt" TIMESTAMP WITH TIME ZONE NOT NULL,
"updatedAt" TIMESTAMP WITH TIME ZONE NOT NULL,
PRIMARY KEY ("id")
);
Data to test with
INSERT INTO "campaign" ("id", "userId", "userCompanyId", "type", "title", "description", "sources", "configuration", "active", "excludedUserNames", "limit", "startAt", "endAt", "removedAt", "createdAt", "updatedAt") VALUES
(1, 1, 1, E'test', E'Test', E'Test', E'{a}', E'{"query":{"accounts":[],"hashtags":["GavinTest1234","XYZ"]}}', E'true', NULL, 0, E'2016-01-25 15:06:00+00', E'2016-01-27 23:59:59+00', NULL, E'2016-01-25 15:06:27.474+00', E'2016-01-26 16:48:19.693+00');
INSERT INTO "data" ("id", "hash", "source", "isFiltered", "campaignId", "data", "meta", "modifiedReason", "createdAt", "updatedAt") VALUES
(1, E'dHdpdHRlci02OTE5MTQ3ODcwNjA1ODQ0NDg=', E'a', E'false', 1, E'{}', E'{"profile":{"url":"xxx","image":"xxx","username":"xxx","name":"xxx","createdAt":"2015-10-05T10:30:11.000Z"},"posts":{"total":32,"perDay":0},"friends":0,"favourites":0,"createdAt":"2016-01-26T09:25:15.000Z","matchedOn":{"hashtags":["GavinTest123"],"accounts":[]}}', NULL, E'2016-01-26 09:25:15.539+00', E'2016-01-26 09:25:15.539+00'),
(2, E'dHdpdHRlci02OTE5MjAwNDAwNTcyNzAyNzI=', E'a', E'false', 1, E'{}', E'{"profile":{"url":"xxx","image":"xxx","username":"xxx","name":"xxx","createdAt":"2015-10-05T10:30:11.000Z"},"posts":{"total":34,"perDay":0},"friends":0,"favourites":0,"createdAt":"2016-01-26T09:46:07.000Z","matchedOn":{"hashtags":["GavinTest123"],"accounts":[]}}', NULL, E'2016-01-26 09:46:07.942+00', E'2016-01-26 09:46:07.942+00'),
(3, E'dHdpdHRlci02OTE5NjI4NjM5OTc1NTg3ODQ=', E'a', E'false', 1, E'{}', E'{"profile":{"url":"xxx","image":"xxx","username":"xxx","name":"xxx","createdAt":"2015-10-05T10:30:11.000Z"},"posts":{"total":36,"perDay":0},"friends":0,"favourites":0,"createdAt":"2016-01-26T12:36:17.000Z","matchedOn":{"hashtags":["GavinTest1234"],"accounts":[]}}', NULL, E'2016-01-26 12:36:17.724+00', E'2016-01-26 12:36:17.724+00');
THE QUERY
SELECT q."date",
q."hashtag",
Max(q."count") AS "count"
FROM (
SELECT To_char(d::date, 'DD/MM/YYYY') AS "date",
0 AS "count",
c_h.hashtag::text AS "hashtag"
FROM generate_series('2016-01-20', '2016-01-26', '1 day'::interval) d
INNER JOIN "campaign" c
ON (
c."id" = 1)
INNER JOIN Json_array_elements(c.configuration->'query'->'hashtags') c_h(hashtag)
ON true
UNION ALL
SELECT To_char(cdi."createdAt"::date, 'DD/MM/YYYY') AS "date",
Count(cdi_h.hashtag::text) AS "count",
cdi_h.hashtag::text AS "hashtag"
FROM "data" cdi
INNER JOIN "campaign" c
ON (
c."id" = cdi."campaignId")
INNER JOIN json_array_elements(c.configuration->'query'->'hashtags') c_h(hashtag)
ON true
INNER JOIN json_array_elements(cdi.meta->'matchedOn'->'hashtags') cdi_h(hashtag)
ON (
c_h.hashtag::text = cdi_h.hashtag::text)
WHERE c."id" = 1
AND (
cdi."createdAt"::date >= '2016-01-20'
AND cdi."createdAt"::date <= '2016-01-26')
GROUP BY to_char(cdi."createdAt":: date, 'DD/MM/YYYY'),
cdi_h.hashtag::text
ORDER BY "date" ASC,
"hashtag" ASC,
"count" ASC ) q
GROUP BY q."date",
q."hashtag";
Results
So, as you should see from the query, I am first returning a result set containing a row for each date and hashtag.
Then, I want to query my data table and return back a count for how many matches there are per day, overwriting the default values.
The query works and it isn't too costly however it's daaamn ugly and I'm sure there is a better way that I'm too blind to see.
Has anyone got any better suggestions, other than this, or my alternative of returning back only the results (grouped and counted) we have over the 7 day period and doing some processing server side.
-- EDIT --
Sorry, SQL Fiddle only has 9.3 and crashes when I try and create a fiddle for you all.
Solution
Your query can be simplified in various respects:
SELECT to_char(day, 'DD/MM/YYYY') AS date
, hashtag
, count(d.*)::int AS count
FROM (
campaign c
CROSS JOIN json_array_elements_text(c.configuration#>'{query,hashtags}') ch(hashtag)
CROSS JOIN (SELECT g::date AS day
FROM generate_series(timestamp '2016-01-20', '2016-01-26', interval '1 day') g) day
)
NATURAL LEFT JOIN (
SELECT "createdAt"::date AS day, dh.hashtag
FROM data, json_array_elements_text(meta#>'{matchedOn,hashtags}') dh(hashtag)
WHERE "campaignId" = 1
AND "createdAt" >= '2016-01-20'
AND "createdAt" < '2016-01-27'
) d
WHERE c.id = 1
GROUP BY day, hashtag
ORDER BY day, hashtag, count;
This should be faster for multiple reasons. Not least it can use a multi-column index on data("campaignId", "createdAt")
- which you should create unless you have it.
LEFT JOIN
is the core feature you need. Then count the column, only non-null values count ...
One of the very rare cases where a NATURAL JOIN
is useful, btw. You don't need it, though. It's just a minor syntax shortcut.
And rather use timestamp
input for generate_series()
: