I have a database table called "Prices" which holds in itself data where there are different items with different prices and other properties.
My goal is to group this data by hour (many samples are being collected hourly). The summarized data should be inserted into table called "HourlyData".
The problem I am having is that my database which takes 2GB~ of space approximately at the moment ends up using over 40GB space during the query execution. That is bad news for me because I am using VPS and HDD capacity is limited and I end up having no space left on the device to finish the query execution.
I have included the SQLfiddle to ensure better code readability here:
http://sqlfiddle.com/#!15/38783/4
There are two functions in the sqlfiddle paste, keep in mind that sqlfiddle doesn't support functions (as far as I know), so the code won't work without adjustments but for demonstration purposes it should be enough. Also the SCHEMA is built without correct ID syntax which should auto increment themselves and be primary keys of my tables - if that's somehow relevant.
Since I am a newbie to postgresql/SQL - to maintain this ton of code, I have chopped it into temporary tables instead of one big query with many subqueries inside. I am unsure if it's a bad practice usage of temporary tables or not but I haven't noticed a difference in CACHE with both ways.
My question is, how could I go about this current execution more effectively?
I am using lots of AVG and LAG functions in the end of my second SQL function, I assume they are the ones to blame for cache building up but I haven't figured out a solution how could I do this piece by piece which would require less cache, since LAG function isn't usable with UPDATE command.
One of the solutions of course is to summarize the data more often in the future. The other solution I've came up with is to dump the database and download it from VPS and run the functions from local computer where HDD space isn't a problem. However I am hoping that this is more solvable with more experienced SQL programming - if its not then is there anything I could do to optimize the current code anyway in terms of cache size?
Table code
-- Table: "Prices"
-- DROP TABLE "Prices";
CREATE TABLE "Prices"
(
data_id integer,
level smallint,
sell_price integer,
buy_price integer,
sell_count integer,
buy_count integer, --
name character varying(100),
date timestamp without time zone DEFAULT ('now'::text)::timestamp(1) without time zone,
"ID" serial NOT NULL,
CONSTRAINT "Prices_pkey" PRIMARY KEY ("ID")
)
WITH (
OIDS=FALSE
);
-- Index: "Prices_ID_idx"
CREATE INDEX "Prices_ID_idx"
ON "Prices"
USING btree
("ID");
-- Index: "Prices_data_id_idx"
CREATE INDEX "Prices_data_id_idx"
ON "Prices"
USING btree
(data_id);
-- Table: "HourlyData"
CREATE TABLE "HourlyData"
(
data_id bigint,
name character varying(100),
date_time timestamp without time zone,
hour integer,
day integer,
buy numeric(20,4),
sell numeric(20,4),
prev_buy numeric(20,4),
prev_sell numeric(20,4),
buy_count integer,
sell_count integer,
prev_buy_count integer,
prev_sell_count integer,
ab_change numeric(10,2),
as_change numeric(10,2),
abc_change numeric(10,2),
asc_change numeric(10,2),
"ID" integer NOT NULL DEFAULT nextval('"DailyData_ID_seq"'::regclass),
CONSTRAINT "DailyData_pkey" PRIMARY KEY ("ID")
)
WITH (
OIDS=FALSE
);
-- Index: "DailyData_data_id_idx"
CREATE INDEX "DailyData_data_id_idx"
ON "HourlyData"
USING btree
(data_id);
INSERT INTO Prices
("data_id", "level", "sell_price", "buy_price", "sell_count", "buy_count", "name", "date")
VALUES
(28262, 80, 18899, 15000, 53, 66, 'random_item', '2013-12-16 01:38:07'),
(28262, 80, 18899, 15000, 53, 66, 'random_item', '2013-12-16 01:44:31'),
(28262, 80, 18987, 15000, 46, 65, 'random_item', '2013-12-16 01:30:22'),
(28262, 80, 18987, 16000, 49, 65, 'random_item', '2013-12-16 01:00:19'),
(28265, 80, 18987, 16000, 48, 64, 'random_itema', '2013-12-16 01:30:20'),
(28265, 80, 18987, 16000, 48, 64, 'random_itema', '2013-12-16 01:00:21'),
(28265, 80, 17087, 16000, 49, 63, 'random_itema', '2013-12-16 01:30:22'),
(28262, 80, 18980, 5028, 48, 62, 'random_item', '2013-12-16 10:00:28'),
(28262, 80, 18975, 5528, 50, 60, 'random_item', '2013-12-16 10:30:30'),
(28262, 80, 18975, 5228, 51, 59, 'random_item', '2013-12-16 10:00:27'),
(28262, 80, 18975, 5500, 52, 59, 'random_item', '2013-12-16 10:30:21'),
(28262, 80, 18975, 5600, 53, 59, 'random_item', '2013-12-16 10:00:23'),
(28262, 80, 18979, 5700, 50, 58, 'random_item', '2013-12-16 10:30:28'),
(28262, 80, 18977, 5028, 51, 56, 'random_item', '2013-12-16 10:00:23'),
(28264, 80, 18978, 5028, 51, 54, 'random_itemaw', '2013-12-16 10:30:25'),
(28264, 80, 18979, 5628, 50, 54, 'random_itemaw', '2013-12-16 10:00:28'),
(28264, 80, 18979, 5028, 52, 64, 'random_itemaw', '2013-12-16 10:30:26'),
(28264, 80, 18979, 15028, 52, 64, 'random_item', '2013-12-16 11:00:25'),
(28264, 80, 17977, 15028, 56, 63, 'random_item', '2013-12-16 11:30:24'),
(28264, 80, 17977, 15029, 58, 62, 'random_item', '2013-12-16 11:00:30'),
(28262, 80, 17977, 15027, 58, 62, 'random_item', '2013-12-16 11:30:22'),
(28262, 80, 16000, 15022, 59, 49, 'random_item', '2013-12-16 11:00:26'),
(28262, 80, 17979, 15021, 56, 49, 'random_item', '2013-12-16 11:30:26'),
(28262, 80, 17969, 15023, 58, 44, 'random_item', '2013-12-16 11:00:31'),
(28262, 80, 18987, 15027, 48, 44, 'random_item', '2013-12-16 12:30:33'),
(28262, 80, 20819, 15027, 40, 43, 'random_item', '2013-12-16 12:00:32'),
(28262, 80, 21810, 15034, 37, 48, 'random_item', '2013-12-16 12:30:24'),
(28262, 80, 21810, 15037, 39, 49, 'random_item', '2013-12-16 22:00:18'),
(28262, 80, 21810, 15038, 39, 49, 'random_item', '2013-12-16 22:30:25'),
(28262, 80, 21810, 15038, 39, 49, 'random_item', '2013-12-16 22:00:25'),
(28262, 80, 21710, 15039, 40, 49, 'random_item', '2013-12-16 22:30:24'),
(28262, 80, 21709, 15040, 41, 49, 'random_item', '2013-12-16 22:00:24'),
(28262, 80, 21709, 15040, 41, 49, 'random_item', '2013-12-16 22:30:22'),
(28262, 80, 21709, 15040, 41, 49, 'random_item', '2013-12-16 23:00:24'),
(28262, 80, 21709, 15041, 41, 49, 'random_item', '2013-12-16 23:30:27'),
(28266, 80, 21708, 15042, 42, 50, 'random_item1', '2013-12-17 05:00:26'),
(28266, 80, 20000, 15041, 43, 49, 'random_item1', '2013-12-17 05:30:21'),
(28266, 80, 20000, 15097, 43, 52, 'random_item1', '2013-12-17 05:00:28'),
(28262, 80, 20000, 15097, 43, 52, 'random_item', '2013-12-17 05:30:28'),
(28262, 80, 20000, 15097, 43, 52, 'random_item', '2013-12-17 05:00:31'),
(28262, 80, 20000, 15097, 44, 51, 'random_item', '2013-12-17 05:30:34'),
(28262, 80, 19997, 15097, 44, 47, 'random_item', '2013-12-17 05:00:20'),
(28262, 80, 19997, 15098, 44, 50, 'random_item', '2013-12-17 05:30:26'),
(28262, 80, 19997, 15098, 44, 50, 'random_item', '2013-12-17 05:00:24'),
(28262, 80, 19997, 15098, 44, 49, 'random_item', '2013-12-17 05:35:44'),
(28262, 80, 19996, 15098, 45, 48, 'random_item', '2013-12-17 05:00:22'),
(28262, 80, 19996, 15097, 46, 47, 'random_item', '2013-12-17 05:30:24'),
(28262, 80, 19996, 15097, 46, 47, 'random_item', '2013-12-17 05:00:29'),
(28262, 80, 19996, 15097, 46, 47, 'random_item', '2013-12-17 05:30:24'),
(28262, 80, 19996, 15041, 47, 46, 'random_item', '2013-12-17 05:00:25')
;
Functions
-- Function: percentageincrease(numeric, numeric)
CREATE OR REPLACE FUNCTION percentageincrease(lugeja numeric, nimetaja numeric)
RETURNS numeric AS
$BODY$
BEGIN
IF Nimetaja IS NULL or Nimetaja = 0
THEN RETURN 0;
ELSE
RETURN ROUND((Lugeja - Nimetaja) / Nimetaja * 100, 2);
END IF;
END;
$BODY$
LANGUAGE plpgsql;
-- Function: process_hourly_data()
CREATE OR REPLACE FUNCTION process_hourly_data()
RETURNS void AS
$BODY$
CREATE TEMP TABLE "TEMP_summarize1" AS
SELECT
prices.data_id AS data_id,
prices.name AS name,
date_part('hour', prices.date) AS hour,
date_part('day', prices.date) AS day,
prices.date AS date_var,
prices.buy_price,
prices.sell_price,
prices.sell_count,
prices.buy_count
FROM "Prices" as prices;
CREATE TEMP TABLE "TEMP_summarize2"
(
item_name character varying(100),
data_id bigint,
hour integer,
day integer,
date_var timestamp without time zone,
avgbuy smallint,
avgsell smallint,
avgsellCount smallint,
avgbuyCount smallint
);
PREPARE TEMP2 AS
INSERT INTO "TEMP_summarize2"
SELECT
MAX(whatever.name) as item_name,
whatever.data_id,
prices.hour,
MAX(prices.day) as day,
MAX(prices.date_var) as date_var,
AVG(prices.buy_price) AS avgbuy,
AVG(prices.sell_price) AS avgsell,
AVG(prices.sell_count) AS avgsellCount,
AVG(prices.buy_count) AS avgbuyCount
FROM "TEMP_summarize1" AS prices, (SELECT data_id, name FROM "TEMP_summarize1" as whatever) AS whatever
WHERE whatever.data_id = prices.data_id AND whatever.name = prices.name
GROUP BY hour, whatever.data_id;
PREPARE TEMP3 AS
INSERT INTO "HourlyData"
SELECT
data_id,
item_name,
date_var,
hour,
day,
avgbuy,
avgsell,
LAG(avgbuy, 1, NULL) OVER(PARTITION BY data_id) AS last_avgbuy,
LAG(avgsell, 1, NULL) OVER(PARTITION BY data_id) AS last_avgsell,
avgsellCount,
avgbuyCount,
LAG(avgsellCount, 1, NULL) OVER (PARTITION BY data_id) AS last_avgsellCount,
LAG(avgbuyCount, 1, NULL) OVER (PARTITION BY data_id) AS last_avgbuyCount,
percentageincrease(LAG(avgbuy, 1, NULL) OVER(PARTITION BY data_id), avgbuy),
percentageincrease(LAG(avgsell, 1, NULL) OVER(PARTITION BY data_id), avgsell),
percentageincrease(LAG(avgsellCount, 1, NULL) OVER (PARTITION BY data_id), avgsellCount),
percentageincrease(LAG(avgbuyCount, 1, NULL) OVER (PARTITION BY data_id), avgbuyCount)
FROM "TEMP_summarize2"
ORDER BY data_id, hour;
EXECUTE TEMP2;
EXECUTE TEMP3;
$BODY$
LANGUAGE sql;