Wärmestrom

References:
https://learnsql.com/blog/moving-average-in-sql/
https://stackoverflow.com/questions/55491046/how-to-set-the-running-file-path-of-jupyter-in-vscode

Get Wärmestrom data

We get it by querying the SQLite table directly and filtering by meter id. But for Wärmestrom we have two different meters, because there are two tariffs depending on the time of day.

%%sql 

CREATE OR REPLACE TABLE waermestrom_sqlite AS 
SELECT 
  meterid, 
  -- Blob Functions, because most columns get read as blob
  -- https://duckdb.org/docs/sql/functions/blob
  decode(date)::DATETIME AS date, 
  decode(value)::INT AS value
FROM sqlite_scan('{{latest_file}}', 'reading') 
WHERE meterid = 2 OR meterid = 3
;
SELECT * FROM waermestrom_sqlite;
meterid date value
0 3 2020-11-30 17:36:00 9773
1 2 2020-11-30 17:36:00 9456
2 2 2020-12-01 00:00:00 9464
3 3 2020-12-01 00:00:00 9779
4 2 2020-12-01 14:23:00 9470
... ... ... ...
4728 3 2026-04-07 08:20:00 6035
4729 2 2026-04-11 23:42:00 5023
4730 3 2026-04-11 23:42:00 6052
4731 3 2026-04-12 09:12:00 6052
4732 2 2026-04-12 09:12:00 5027

4733 rows × 3 columns

Ideally, there would be one measurement for each tariff, for every date (minute). But we cannot guarantee that’s the case (e.g. the measurement for one tariff can be at 13:08:59 and for the other at 13:09:00). So let’s check whether it is.

%%sql 

SELECT date, count(*) AS cnt
FROM waermestrom_sqlite
GROUP BY date
;
date cnt
0 2020-11-30 17:36:00 2
1 2020-12-01 07:18:00 1
2 2020-12-01 07:19:00 1
3 2020-12-01 20:22:00 1
4 2020-12-03 02:38:00 2
... ... ...
2589 2026-02-25 08:38:00 2
2590 2026-02-26 09:25:00 2
2591 2026-03-07 21:03:00 2
2592 2026-03-15 16:24:00 2
2593 2026-04-03 20:58:00 2

2594 rows × 2 columns

Yep, there are some cases with cnt=1. More precisely, this is how many:

%%sql 

WITH ucnt AS (
  SELECT date, count(*) AS cnt
  FROM waermestrom_sqlite
  GROUP BY date
)
SELECT cnt, COUNT(*) FROM ucnt GROUP BY cnt
;
cnt count_star()
0 1 455
1 2 2139

Safest would be to full-join on the dates and fill in the gaps with the closest value. (The counts add up: 455 single dates plus 2 × 2139 paired dates accounts for all 4733 readings.) Let’s see.

%%sql

CREATE OR REPLACE TABLE waermestrom_nulls AS
WITH
ws181 AS (
  SELECT 
    'Hoch' AS tariff,
    decode(date)::DATETIME AS date, 
    decode(value)::INT AS value
  FROM sqlite_scan('{{latest_file}}', 'reading') 
  WHERE meterid = 3 
),
ws182 AS (
  SELECT 
    'Niedrig' AS tariff, 
    decode(date)::DATETIME AS date, 
    decode(value)::INT AS value
  FROM sqlite_scan('{{latest_file}}', 'reading') 
  WHERE meterid = 2
)
SELECT
  COALESCE(ws181.date, ws182.date) AS date,
  ws181.value AS value_hoch,
  ws182.value AS value_niedrig
FROM ws181 
FULL JOIN ws182 
ON ws181.date = ws182.date
ORDER BY date
;
SELECT * FROM waermestrom_nulls LIMIT 20;
date value_hoch value_niedrig
0 2016-07-04 07:50:00 0 0
1 2016-07-04 08:00:00 1 1
2 2016-11-28 00:21:00 547 <NA>
3 2016-11-28 15:23:00 <NA> 484
4 2017-11-25 15:20:00 3347 <NA>
5 2017-11-25 15:23:00 <NA> 2677
6 2018-11-23 15:19:00 5747 <NA>
7 2018-11-23 15:22:00 <NA> 4893
8 2019-11-29 14:18:00 7914 <NA>
9 2019-11-29 14:22:00 <NA> 7224
10 2020-11-30 07:07:00 9764 9456
11 2020-11-30 17:36:00 9773 9456
12 2020-12-01 00:00:00 9779 9464
13 2020-12-01 07:18:00 9782 <NA>
14 2020-12-01 07:19:00 <NA> 9470
15 2020-12-01 14:23:00 9792 9470
16 2020-12-01 20:21:00 9798 <NA>
17 2020-12-01 20:22:00 <NA> 9470
18 2020-12-02 00:01:00 9800 9473
19 2020-12-02 07:29:00 9802 9476

Yeah, those are the cases: 2020-12-01 07:18:00 has a Hoch value but no Niedrig, and 2020-12-01 07:19:00 has a Niedrig value but no Hoch.

So now we want to fill in those gaps using the value of the same column at the closest date. This is tricky, because it cannot simply be a fill-down or a fill-up: in one case the correct value is one position up, in another one position down. The approach below assumes there are no consecutive NULLs in the value columns; note also that it takes advantage of DuckDB’s flexible SQL syntax, reusing column aliases within the same SELECT (otherwise it would have been even longer, with a bunch of CTEs).
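The no-consecutive-NULLs assumption is easy to verify first. A minimal sanity check (not part of the original analysis), counting NULLs that directly follow another NULL in the same column; both counts should be zero:

%%sql

WITH lagged AS (
  SELECT
    value_hoch,
    value_niedrig,
    -- previous row's values, ordered by date
    lag(value_hoch) OVER(ORDER BY date) AS prev_hoch,
    lag(value_niedrig) OVER(ORDER BY date) AS prev_niedrig
  FROM waermestrom_nulls
)
SELECT
  count(*) FILTER (WHERE value_hoch IS NULL AND prev_hoch IS NULL) AS consecutive_nulls_hoch,
  count(*) FILTER (WHERE value_niedrig IS NULL AND prev_niedrig IS NULL) AS consecutive_nulls_niedrig
FROM lagged
;

With that out of the way, here’s the approach: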

%%sql

CREATE OR REPLACE TABLE waermestrom_nonulls AS
SELECT
  date,
  value_hoch, value_niedrig, 
  -- calculate the minutes diff with the previous and next date, to see which is closer
  -- note the use of a default value for lag/lead, subtracting and adding one day
  -- for lag and lead respectively, to avoid NULLs in the first and last rows
  date_sub('minute', lag(date, 1, date - INTERVAL 1 DAY) over(order by date), date) AS minutes_lag,
  date_sub('minute', date, lead(date, 1, date + INTERVAL 1 DAY) over(order by date)) AS minutes_lead,
  -- and we want to replace null values column, with the value from closest date
  CASE
    WHEN value_hoch IS NULL AND minutes_lag <= minutes_lead 
    THEN lag(value_hoch) over(order by date)
    WHEN value_hoch IS NULL AND minutes_lag > minutes_lead 
    THEN lead(value_hoch) over(order by date)
    ELSE value_hoch
  END AS value_hoch_fix,
  CASE
    WHEN value_niedrig IS NULL AND minutes_lag <= minutes_lead 
    THEN lag(value_niedrig) over(order by date)
    WHEN value_niedrig IS NULL AND minutes_lag > minutes_lead 
    THEN lead(value_niedrig) over(order by date)
    ELSE value_niedrig
  END AS value_niedrig_fix,
  value_hoch_fix + value_niedrig_fix AS value
FROM waermestrom_nulls 
ORDER BY date
;
SELECT * FROM waermestrom_nonulls ORDER BY date;
date value_hoch value_niedrig minutes_lag minutes_lead value_hoch_fix value_niedrig_fix value
0 2016-07-04 07:50:00 0 0 1440 10 0 0 0
1 2016-07-04 08:00:00 1 1 10 211221 1 1 2
2 2016-11-28 00:21:00 547 <NA> 211221 902 547 484 1031
3 2016-11-28 15:23:00 <NA> 484 902 521277 547 484 1031
4 2017-11-25 15:20:00 3347 <NA> 521277 3 3347 2677 6024
... ... ... ... ... ... ... ... ...
2589 2026-04-06 22:46:00 6033 5002 2893 573 6033 5002 11035
2590 2026-04-07 08:19:00 <NA> 5005 573 1 6035 5005 11040
2591 2026-04-07 08:20:00 6035 <NA> 1 6682 6035 5005 11040
2592 2026-04-11 23:42:00 6052 5023 6682 570 6052 5023 11075
2593 2026-04-12 09:12:00 6052 5027 570 1440 6052 5027 11079

2594 rows × 8 columns
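As an aside, a DuckDB ASOF JOIN could express part of this more compactly, but it is not an exact replacement: it pairs each reading only with the closest preceding reading of the other meter, never with a later one. A rough sketch of what that would look like, reusing the same source tables:

%%sql

-- Sketch only: ASOF JOIN matches each Hoch reading with the latest Niedrig
-- reading at or before it (closest *preceding* match, unlike the query above)
WITH
ws181 AS (
  SELECT decode(date)::DATETIME AS date, decode(value)::INT AS value
  FROM sqlite_scan('{{latest_file}}', 'reading') WHERE meterid = 3
),
ws182 AS (
  SELECT decode(date)::DATETIME AS date, decode(value)::INT AS value
  FROM sqlite_scan('{{latest_file}}', 'reading') WHERE meterid = 2
)
SELECT ws181.date, ws181.value AS value_hoch, ws182.value AS value_niedrig
FROM ws181
ASOF LEFT JOIN ws182 ON ws181.date >= ws182.date
ORDER BY ws181.date
;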

Good, now we just need to calculate the consumption and create the main table.

%%sql 

CREATE OR REPLACE TABLE waermestrom AS
SELECT 
  date,
  value,
  value_hoch_fix AS value_hoch,
  value_niedrig_fix AS value_niedrig,
  minutes_lag AS minutes,
  -- add a default value to lag(), to prevent a NULL in the first row:
  -- use 11 kWh less than the first value, which is approximately the avg consumption
  -- per day and thus matches the minutes in the first row, which we set to one day
  -- with the lag/lead defaults in the previous query
  value - lag(value, 1, value-11) over(order by date) AS consumption,
  1.0 * consumption / minutes_lag AS cm,
  24.0 * 60.0 * consumption / minutes_lag AS consumption_day_equivalent,
  -- now calculate consumption per tariff
  value_hoch_fix - lag(value_hoch_fix, 1, value_hoch_fix-11) over(order by date) AS consumption_hoch,
  value_niedrig_fix - lag(value_niedrig_fix, 1, value_niedrig_fix-11) over(order by date) AS consumption_niedrig,
  1.0 * consumption_hoch / minutes_lag AS cm_hoch,
  1.0 * consumption_niedrig / minutes_lag AS cm_niedrig
FROM waermestrom_nonulls 
WHERE minutes > 1 --get rid of the artificially short periods
;
SELECT * FROM waermestrom ORDER BY date;
date value value_hoch value_niedrig minutes consumption cm consumption_day_equivalent consumption_hoch consumption_niedrig cm_hoch cm_niedrig
0 2016-07-04 07:50:00 0 0 0 1440 11 0.007639 11.000000 11 11 0.007639 0.007639
1 2016-07-04 08:00:00 2 1 1 10 2 0.200000 288.000000 1 1 0.100000 0.100000
2 2016-11-28 00:21:00 1031 547 484 211221 1029 0.004872 7.015212 546 483 0.002585 0.002287
3 2016-11-28 15:23:00 1031 547 484 902 0 0.000000 0.000000 0 0 0.000000 0.000000
4 2017-11-25 15:20:00 6024 3347 2677 521277 4993 0.009578 13.792897 2800 2193 0.005371 0.004207
... ... ... ... ... ... ... ... ... ... ... ... ...
2367 2026-04-04 22:33:00 11016 6033 4983 1535 12 0.007818 11.257329 0 12 0.000000 0.007818
2368 2026-04-06 22:46:00 11035 6033 5002 2893 19 0.006568 9.457311 0 19 0.000000 0.006568
2369 2026-04-07 08:19:00 11040 6035 5005 573 5 0.008726 12.565445 2 3 0.003490 0.005236
2370 2026-04-11 23:42:00 11075 6052 5023 6682 35 0.005238 7.542652 17 18 0.002544 0.002694
2371 2026-04-12 09:12:00 11079 6052 5027 570 4 0.007018 10.105263 0 4 0.000000 0.007018

2372 rows × 12 columns


Visualize the data

%%sql 
waermestrom << SELECT * FROM waermestrom;

Again, very noisy data, with substantial variation in the consumption day equivalent, and there are 1.5 years without data. But here you can already kinda see the seasonal pattern of higher consumption in wintertime.

import plotly.express as px
fig = px.line(waermestrom, x='date', y="consumption_day_equivalent")
fig.show()

The minutes show a similar pattern, but with a bunch of very low values (just a few minutes), which should come from combining the two meters when their readings do not fall in exactly the same minute.

import plotly.express as px
fig = px.histogram(waermestrom.query("minutes < 10000"), x="minutes", marginal="box")
fig.show()
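To check where those short periods come from, we can peek at the smallest minutes values (a quick look, not in the original notebook):

%%sql

-- the shortest remaining periods, after the minutes > 1 filter above
SELECT date, minutes, consumption
FROM waermestrom
ORDER BY minutes
LIMIT 10
;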

The consumption day equivalent also varies substantially, and it is of course higher than for the normal Strom.

import plotly.express as px
fig = px.histogram(waermestrom.query("minutes < 10000"), x="consumption_day_equivalent", marginal="box")
fig.show()

Here the relationship between minutes and consumption is not as marked as for the normal Strom.

from matplotlib import pyplot
pyplot.scatter(
    waermestrom.query("minutes < 10000")["minutes"], 
    waermestrom.query("minutes < 10000")["consumption_day_equivalent"]
)

import plotly.express as px
fig = px.scatter(
    data_frame=waermestrom.query("minutes < 10000"), 
    x="minutes", 
    y="consumption_day_equivalent", hover_data=['date'],
    marginal_x="histogram", 
    marginal_y="histogram"
)
fig.show()

Consumption by hour

So, again, let’s take the inefficient but straightforward way. First, expand to one row per minute over the whole date range.

%%sql

CREATE OR REPLACE TABLE waermestrom_minute_nulls AS
WITH minutes_table AS (
  SELECT UNNEST(generate_series(ts[1], ts[2], interval 1 minute)) as minute
  FROM (VALUES (
    [(SELECT MIN(date) FROM waermestrom), (SELECT MAX(DATE) FROM waermestrom)]
  )) t(ts)
)
SELECT * 
FROM minutes_table
LEFT JOIN waermestrom
ON minutes_table.minute = waermestrom.date
;
SELECT * FROM waermestrom_minute_nulls ORDER BY minute LIMIT 10;
minute date value value_hoch value_niedrig minutes consumption cm consumption_day_equivalent consumption_hoch consumption_niedrig cm_hoch cm_niedrig
0 2016-07-04 07:50:00 2016-07-04 07:50:00 0 0 0 1440 11 0.007639 11.0 11 11 0.007639 0.007639
1 2016-07-04 07:51:00 NaT <NA> <NA> <NA> <NA> <NA> NaN NaN <NA> <NA> NaN NaN
2 2016-07-04 07:52:00 NaT <NA> <NA> <NA> <NA> <NA> NaN NaN <NA> <NA> NaN NaN
3 2016-07-04 07:53:00 NaT <NA> <NA> <NA> <NA> <NA> NaN NaN <NA> <NA> NaN NaN
4 2016-07-04 07:54:00 NaT <NA> <NA> <NA> <NA> <NA> NaN NaN <NA> <NA> NaN NaN
5 2016-07-04 07:55:00 NaT <NA> <NA> <NA> <NA> <NA> NaN NaN <NA> <NA> NaN NaN
6 2016-07-04 07:56:00 NaT <NA> <NA> <NA> <NA> <NA> NaN NaN <NA> <NA> NaN NaN
7 2016-07-04 07:57:00 NaT <NA> <NA> <NA> <NA> <NA> NaN NaN <NA> <NA> NaN NaN
8 2016-07-04 07:58:00 NaT <NA> <NA> <NA> <NA> <NA> NaN NaN <NA> <NA> NaN NaN
9 2016-07-04 07:59:00 NaT <NA> <NA> <NA> <NA> <NA> NaN NaN <NA> <NA> NaN NaN

And fill in the NULLs

%%sql

CREATE OR REPLACE TABLE waermestrom_minute AS
SELECT
  minute,
  date,
  value,
  value_hoch,
  value_niedrig,
  minutes,
  consumption,
  FIRST_VALUE(cm IGNORE NULLS) OVER(
    ORDER BY minute ROWS BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING 
  ) AS cm,
  FIRST_VALUE(cm_hoch IGNORE NULLS) OVER(
    ORDER BY minute ROWS BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING 
  ) AS cm_hoch,
  FIRST_VALUE(cm_niedrig IGNORE NULLS) OVER(
    ORDER BY minute ROWS BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING 
  ) AS cm_niedrig
FROM waermestrom_minute_nulls t1
ORDER BY t1.minute
;
SELECT * FROM waermestrom_minute ORDER BY minute LIMIT 100;
minute date value value_hoch value_niedrig minutes consumption cm cm_hoch cm_niedrig
0 2016-07-04 07:50:00 2016-07-04 07:50:00 0 0 0 1440 11 0.007639 0.007639 0.007639
1 2016-07-04 07:51:00 NaT <NA> <NA> <NA> <NA> <NA> 0.200000 0.100000 0.100000
2 2016-07-04 07:52:00 NaT <NA> <NA> <NA> <NA> <NA> 0.200000 0.100000 0.100000
3 2016-07-04 07:53:00 NaT <NA> <NA> <NA> <NA> <NA> 0.200000 0.100000 0.100000
4 2016-07-04 07:54:00 NaT <NA> <NA> <NA> <NA> <NA> 0.200000 0.100000 0.100000
... ... ... ... ... ... ... ... ... ... ...
95 2016-07-04 09:25:00 NaT <NA> <NA> <NA> <NA> <NA> 0.004872 0.002585 0.002287
96 2016-07-04 09:26:00 NaT <NA> <NA> <NA> <NA> <NA> 0.004872 0.002585 0.002287
97 2016-07-04 09:27:00 NaT <NA> <NA> <NA> <NA> <NA> 0.004872 0.002585 0.002287
98 2016-07-04 09:28:00 NaT <NA> <NA> <NA> <NA> <NA> 0.004872 0.002585 0.002287
99 2016-07-04 09:29:00 NaT <NA> <NA> <NA> <NA> <NA> 0.004872 0.002585 0.002287

100 rows × 10 columns

And now we can simply aggregate per day and hour, and the average will be correct, as all the rows have comparable units (consumption for one minute, with equal weight).
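To see why this works, here is a toy illustration with made-up numbers (not from the meter data): one reading covering 10 minutes at a rate of 1.0 and one covering 30 minutes at a rate of 2.0. A plain average over the two readings gives 1.5, but averaging the minute-expanded rows weights by duration: (10 × 1.0 + 30 × 2.0) / 40 = 1.75.

%%sql

SELECT AVG(rate) AS duration_weighted_avg  -- 1.75 rather than 1.5
FROM (
  SELECT 1.0 AS rate FROM range(10)  -- 10 minute-rows at rate 1.0
  UNION ALL
  SELECT 2.0 AS rate FROM range(30)  -- 30 minute-rows at rate 2.0
);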

%%sql

consumption_hour_avg << SELECT 
  COUNT(*) AS cnt,
  hour(minute) AS hour, 
  AVG(cm)*60*24 AS cmy
FROM waermestrom_minute
GROUP BY hour(minute)
;
import plotly.express as px
fig = px.line(consumption_hour_avg, y='cmy', x='hour')
fig.show()
%%sql

consumption_hour_avg << SELECT 
  hour(minute) AS hour, 
  1.0*AVG(cm)*60*24 AS cmy
FROM waermestrom_minute
WHERE minute <= '2021-05-25' OR minute >= '2022-11-30'
GROUP BY hour(minute)
;
import plotly.express as px
fig = px.bar(consumption_hour_avg, y='cmy', x='hour')
fig.show()

Traces plot

%%sql


SELECT
  hour(minute) AS hour, 
  AVG(1.0 * 60 * 24 * cm) AS cm
FROM waermestrom_minute
WHERE minute <= '2021-05-25' OR minute >= '2022-11-30'
GROUP BY hour(minute)
;
hour cm
0 0 12.894955
1 1 12.836133
2 2 12.819185
3 3 12.815426
4 4 12.797544
5 5 12.771900
6 6 12.720561
7 7 12.701939
8 8 12.569834
9 9 -246.355914
10 10 12.368265
11 11 12.270323
12 12 12.155339
13 13 12.113588
14 14 12.108235
15 15 12.063699
16 16 12.024798
17 17 11.969528
18 18 12.027910
19 19 12.259541
20 20 12.714311
21 21 13.019616
22 22 13.090068
23 23 13.026564
%%sql

SELECT 
  minute,
  date,
  1.0 * 60 * 24 * cm AS cm,
  AVG(1.0 * 60 * 24 * cm) OVER(
    ORDER BY minute ROWS BETWEEN 240 PRECEDING AND CURRENT ROW
  ) AS cmma
FROM waermestrom_minute
WHERE minute > '2022-11-30'
;
minute date cm cmma
0 2022-11-30 00:01:00 NaT 11.974301 11.974301
1 2022-11-30 00:02:00 NaT 11.974301 11.974301
2 2022-11-30 00:03:00 NaT 11.974301 11.974301
3 2022-11-30 00:04:00 NaT 11.974301 11.974301
4 2022-11-30 00:05:00 NaT 11.974301 11.974301
... ... ... ... ...
1770307 2026-04-12 09:08:00 NaT 10.105263 10.105263
1770308 2026-04-12 09:09:00 NaT 10.105263 10.105263
1770309 2026-04-12 09:10:00 NaT 10.105263 10.105263
1770310 2026-04-12 09:11:00 NaT 10.105263 10.105263
1770311 2026-04-12 09:12:00 2026-04-12 09:12:00 10.105263 10.105263

1770312 rows × 4 columns

%%sql

CREATE OR REPLACE TABLE toy AS
WITH hourly_average AS (
  SELECT
    hour(minute) AS hour, 
    AVG(1.0 * 60 * 24 * cm) AS cmha
  FROM waermestrom_minute
  --This filter was originally here because we wanted to see the hourly variation,
  --and keeping this long period without measurements just smoothed things out.
  --But for Wärmestrom it has a misleading implication: since the remaining
  --measurements fall mostly in wintertime, the average without this period
  --is high, reflecting the higher energy consumption during winter.
  --WHERE minute <= '2021-05-25' OR minute >= '2022-11-30'
  GROUP BY hour(minute)
),
last_measurements AS (
  SELECT 
    minute,
    date,
    1.0 * 60 * 24 * cm AS cm,
    AVG(1.0 * 60 * 24 * cm) OVER(
      ORDER BY minute ROWS BETWEEN 60*4 PRECEDING AND CURRENT ROW
    ) AS cmma
  FROM waermestrom_minute
  WHERE minute >= '2022-11-30'
)
SELECT *, CASE WHEN date IS NOT NULL THEN cm ELSE NULL END AS cmdate
FROM last_measurements
LEFT JOIN hourly_average
ON hour(last_measurements.minute) = hourly_average.hour
;
Success
%sql toy << SELECT * FROM toy;
import plotly.express as px
import plotly.graph_objects as go
fig = px.area(toy, x='minute', y='cmha')
fig.add_trace(go.Scatter(
  x=toy['minute'], y=toy['cmma'], mode='lines', showlegend=False
))
fig.add_trace(go.Scatter(
  x=toy['date'], y=toy['cmdate'], mode='markers', showlegend=False
))

# Add range slider
fig.update_layout(
    xaxis=dict(
        rangeselector=dict(
            buttons=list([
                dict(count=1,
                     label="1m",
                     step="month",
                     stepmode="backward"),
                dict(count=7,
                     label="7d",
                     step="day",
                     stepmode="backward"),
                dict(count=15,
                     label="15d",
                     step="day",
                     stepmode="backward"),
                dict(step="all")
            ])
        ),
        rangeslider=dict(
            visible=True
        ),
        type="date"
    )
)

fig.show()

TODO: fix the range, which range?

Let’s see the share of the Niedrig tariff vs. Hoch.

%%sql

niedrig_fraction << SELECT
  date AS day,
  AVG(cm_niedrig / cm) AS niedrig_fraction,
  AVG(cm_niedrig) / AVG(cm) AS niedrig_fraction2,
  -- after GROUP BY the rows are reading dates rather than minutes, so the
  -- moving-average frame counts readings, not the original 60*24*7 minutes
  AVG(AVG(cm_niedrig) / AVG(cm)) OVER(
    ORDER BY date ROWS BETWEEN 7 PRECEDING AND CURRENT ROW
  ) AS niedrig_fraction_ma
FROM waermestrom_minute
GROUP BY date
ORDER BY date
;

import plotly.express as px
fig = px.bar(niedrig_fraction, x='day', y='niedrig_fraction2')
fig.show()