How to improve this union to ensure consistent results?

https://dba.stackexchange.com/questions/127404

29-09-2020
|

Question

I've been trying to find a reliable and economical way of counting rows over a 7-day period, and so far this is the best I've been able to come up with:

Schema

-- Collected items. Each row points at a campaign through "campaignId"
-- (the queries join it to "campaign"."id"; no foreign key is declared here).
-- "meta" is a JSON document; the queries read meta -> 'matchedOn' -> 'hashtags'.
CREATE TABLE IF NOT EXISTS "data" (
    "id"             SERIAL       PRIMARY KEY,
    "hash"           VARCHAR(255) NOT NULL,
    "source"         VARCHAR(255) NOT NULL,
    "isFiltered"     BOOLEAN      NOT NULL,
    "campaignId"     INTEGER      NOT NULL,
    "data"           JSON         NOT NULL,
    "meta"           JSON         NOT NULL,
    "modifiedReason" VARCHAR(255),          -- optional; no default value
    "createdAt"      TIMESTAMPTZ  NOT NULL,
    "updatedAt"      TIMESTAMPTZ  NOT NULL
);

-- Campaign definitions. "configuration" is a JSON document; the queries read
-- configuration -> 'query' -> 'hashtags' to get the hashtags a campaign tracks.
CREATE TABLE IF NOT EXISTS "campaign" (
    "id"                SERIAL         PRIMARY KEY,
    "userId"            INTEGER        NOT NULL,
    "userCompanyId"     INTEGER        NOT NULL,
    "type"              VARCHAR(255),
    "title"             VARCHAR(255),
    "description"       VARCHAR(255),
    "sources"           VARCHAR(255)[],
    "configuration"     JSON           NOT NULL,
    "active"            BOOLEAN        DEFAULT TRUE,   -- nullable
    "excludedUserNames" VARCHAR(255)[],
    "limit"             INTEGER        DEFAULT 0,      -- nullable; name must stay quoted (reserved word)
    "startAt"           TIMESTAMPTZ    NOT NULL,
    "endAt"             TIMESTAMPTZ    NOT NULL,
    "removedAt"         TIMESTAMPTZ,
    "createdAt"         TIMESTAMPTZ    NOT NULL,
    "updatedAt"         TIMESTAMPTZ    NOT NULL
);

Data to test with

-- Fixture: a single campaign (id 1) whose configured hashtags are
-- "GavinTest1234" and "XYZ" (see configuration -> query -> hashtags).
-- NOTE(review): ids are supplied explicitly for SERIAL columns; presumably the
-- backing sequences are not advanced by this — confirm before inserting
-- further rows without an explicit id.
INSERT INTO "campaign" ("id", "userId", "userCompanyId", "type", "title", "description", "sources", "configuration", "active", "excludedUserNames", "limit", "startAt", "endAt", "removedAt", "createdAt", "updatedAt") VALUES
    (1, 1, 1, E'test', E'Test', E'Test', E'{a}', E'{"query":{"accounts":[],"hashtags":["GavinTest1234","XYZ"]}}', E'true', NULL, 0, E'2016-01-25 15:06:00+00', E'2016-01-27 23:59:59+00', NULL, E'2016-01-25 15:06:27.474+00', E'2016-01-26 16:48:19.693+00');
-- Fixture: three "data" rows for campaign 1, all created on 2016-01-26.
-- Rows 1 and 2 were matched on "GavinTest123", which is not one of the
-- campaign's configured hashtags; only row 3 was matched on "GavinTest1234".
INSERT INTO "data" ("id", "hash", "source", "isFiltered", "campaignId", "data", "meta", "modifiedReason", "createdAt", "updatedAt") VALUES
    (1, E'dHdpdHRlci02OTE5MTQ3ODcwNjA1ODQ0NDg=', E'a', E'false', 1, E'{}', E'{"profile":{"url":"xxx","image":"xxx","username":"xxx","name":"xxx","createdAt":"2015-10-05T10:30:11.000Z"},"posts":{"total":32,"perDay":0},"friends":0,"favourites":0,"createdAt":"2016-01-26T09:25:15.000Z","matchedOn":{"hashtags":["GavinTest123"],"accounts":[]}}', NULL, E'2016-01-26 09:25:15.539+00', E'2016-01-26 09:25:15.539+00'),
    (2, E'dHdpdHRlci02OTE5MjAwNDAwNTcyNzAyNzI=', E'a', E'false', 1, E'{}', E'{"profile":{"url":"xxx","image":"xxx","username":"xxx","name":"xxx","createdAt":"2015-10-05T10:30:11.000Z"},"posts":{"total":34,"perDay":0},"friends":0,"favourites":0,"createdAt":"2016-01-26T09:46:07.000Z","matchedOn":{"hashtags":["GavinTest123"],"accounts":[]}}', NULL, E'2016-01-26 09:46:07.942+00', E'2016-01-26 09:46:07.942+00'),
    (3, E'dHdpdHRlci02OTE5NjI4NjM5OTc1NTg3ODQ=', E'a', E'false', 1, E'{}', E'{"profile":{"url":"xxx","image":"xxx","username":"xxx","name":"xxx","createdAt":"2015-10-05T10:30:11.000Z"},"posts":{"total":36,"perDay":0},"friends":0,"favourites":0,"createdAt":"2016-01-26T12:36:17.000Z","matchedOn":{"hashtags":["GavinTest1234"],"accounts":[]}}', NULL, E'2016-01-26 12:36:17.724+00', E'2016-01-26 12:36:17.724+00');

THE QUERY

-- Per-day, per-hashtag match counts for campaign 1 over 2016-01-20 .. 2016-01-26.
-- The derived table q unions a zero-count row for every (day, hashtag) with the
-- real counts; the outer MAX then collapses each (date, hashtag) pair so that a
-- real count, where one exists, wins over the 0 default.
-- NOTE(review): this outer query has no ORDER BY of its own, so the order of the
-- returned rows is presumably not guaranteed — confirm, and add an outer
-- ORDER BY if a stable order is required.
SELECT   q."date", 
         q."hashtag", 
         Max(q."count") AS "count" 
FROM     ( 
     -- Branch 1: default rows — one per generated day per hashtag configured
     -- on campaign 1, each with count 0.
     SELECT     To_char(d::date, 'DD/MM/YYYY') AS "date", 
                0                              AS "count", 
                -- NOTE(review): casting a json element to text presumably keeps
                -- its surrounding double quotes — confirm;
                -- json_array_elements_text would return the bare value.
                c_h.hashtag::text              AS "hashtag" 
     FROM       generate_series('2016-01-20', '2016-01-26', '1 day'::interval) d 
     INNER JOIN "campaign" c 
     ON         ( 
                          c."id" = 1) 
     INNER JOIN Json_array_elements(c.configuration->'query'->'hashtags') c_h(hashtag) 
     ON         true 
     UNION ALL 
     -- Branch 2: real counts — "data" rows of campaign 1 whose "createdAt" date
     -- falls in the range, expanded to one row per matched hashtag and kept only
     -- where that hashtag equals one of the campaign's configured hashtags.
     SELECT     To_char(cdi."createdAt"::date, 'DD/MM/YYYY') AS "date", 
                Count(cdi_h.hashtag::text)                   AS "count", 
                cdi_h.hashtag::text                          AS "hashtag" 
     FROM       "data" cdi 
     INNER JOIN "campaign" c 
     ON         ( 
                          c."id" = cdi."campaignId") 
     INNER JOIN json_array_elements(c.configuration->'query'->'hashtags') c_h(hashtag) 
     ON         true 
     INNER JOIN json_array_elements(cdi.meta->'matchedOn'->'hashtags') cdi_h(hashtag) 
     ON         ( 
                          c_h.hashtag::text = cdi_h.hashtag::text) 
     WHERE      c."id" = 1 
     AND        ( 
                          -- The column is cast to date before comparing, so both
                          -- bounds are whole calendar days (inclusive).
                          cdi."createdAt"::date >= '2016-01-20' 
               AND        cdi."createdAt"::date <= '2016-01-26') 
     GROUP BY   to_char(cdi."createdAt"::   date, 'DD/MM/YYYY'), 
               cdi_h.hashtag::text 
     -- This ORDER BY sits inside the derived table, after the UNION ALL; it
     -- sorts on the 'DD/MM/YYYY' text, not on a date value.
     ORDER BY  "date" ASC, 
               "hashtag" ASC, 
               "count" ASC ) q 
GROUP BY q."date", 
         q."hashtag";

Results

So, as you should see from the query, I am first returning a result set containing a row for each date and hashtag.

Then, I want to query my data table and return back a count for how many matches there are per day, overwriting the default values.

The query works and it isn't too costly; however, it's daaamn ugly, and I'm sure there is a better way that I'm too blind to see.

Has anyone got any better suggestions, other than this or my alternative of returning only the results (grouped and counted) we have over the 7-day period and doing some processing server-side?

-- EDIT --

Sorry, SQL Fiddle only has 9.3 and crashes when I try and create a fiddle for you all.

Solution

Your query can be simplified in various respects:

-- Daily match counts per configured hashtag for campaign 1,
-- 2016-01-20 .. 2016-01-26 (one output row per day and hashtag, 0 when no match).
--
-- Left side:  every (configured hashtag, day) combination for the campaign.
-- Right side: one row per (creation day, matched hashtag) of the campaign's data.
SELECT to_char(day, 'DD/MM/YYYY') AS date
     , hashtag
       -- d is entirely NULL where the LEFT JOIN found no match, so only real
       -- matches are counted and unmatched (day, hashtag) pairs yield 0.
     , count(d.*)::int AS count
FROM  (
       campaign c
CROSS  JOIN json_array_elements_text(c.configuration#>'{query,hashtags}') ch(hashtag)
CROSS  JOIN (SELECT g::date AS day
             FROM generate_series(timestamp '2016-01-20', '2016-01-26', interval '1 day') g) days
      )
-- LEFT JOIN ... USING (day, hashtag) is the explicit spelling of a
-- NATURAL LEFT JOIN here: day and hashtag are the only column names both sides
-- share. Naming them keeps the join keys fixed even if either side later gains
-- another same-named column (the left side exposes every column of campaign).
LEFT   JOIN (
   SELECT "createdAt"::date AS day, dh.hashtag
   FROM   data
   CROSS  JOIN LATERAL json_array_elements_text(meta#>'{matchedOn,hashtags}') dh(hashtag)
   WHERE  "campaignId" = 1
   -- Half-open range on the bare column: covers all of 2016-01-26 and leaves
   -- "createdAt" usable for an index on ("campaignId", "createdAt").
   -- NOTE(review): "createdAt" is timestamptz, so both the ::date cast and
   -- these bounds presumably depend on the session time zone — confirm.
   AND    "createdAt" >= '2016-01-20'
   AND    "createdAt" <  '2016-01-27'
   ) d USING (day, hashtag)
WHERE  c.id = 1
GROUP  BY day, hashtag
-- (day, hashtag) is unique after GROUP BY, so no further sort key is needed.
ORDER  BY day, hashtag;

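This should be faster for multiple reasons. Not least, it can use a multi-column index on data("campaignId", "createdAt") — which you should create unless you already have one — because the range condition compares "createdAt" directly instead of casting it to date first.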

LEFT JOIN is the core feature you need. Then count a column from the joined side: only non-null values are counted, so (day, hashtag) combinations without a match get a count of 0.

One of the very rare cases where a NATURAL JOIN is useful, btw. You don't need it, though. It's just a minor syntax shortcut.

And rather use timestamp input for generate_series():

Generating time series between two dates in PostgreSQL

Licensed under: CC-BY-SA with attribution

Not affiliated with dba.stackexchange