import epyfun

# DWD data — Deutscher Wetterdienst (German Weather Service)
# https://www.dwd.de/DE/leistungen/cdc/cdc_ueberblick-klimadaten.html
#
# Download the station list with all climate stations that provide air
# temperature at hourly resolution, and show them briefly on a map centered
# on Brannenburg, so it is easy to see which stations are closest.
stations_file = "interim/climate/latest/TU_Stundenwerte_Beschreibung_Stationen.txt"
epyfun.web.download_file(
    "https://opendata.dwd.de/climate_environment/CDC/observations_germany/climate/hourly/air_temperature/historical/TU_Stundenwerte_Beschreibung_Stationen.txt",
    stations_file,
)
# The DWD description file is not UTF-8; normalize it before parsing.
epyfun.fs.convert_to_utf8(stations_file)
import pandas as pd

# Parse the fixed-width station description file. The first three lines are
# header and separator rows, so skip them and assign the documented column
# names explicitly (id, validity dates, elevation, lat/lon, name, state).
df = pd.read_fwf(
    stations_file,
    colspecs="infer",
    skiprows=3,
    names=[
        "Stations_id",
        "von_datum",
        "bis_datum",
        "Stationshoehe",
        "geoBreite",
        "geoLaenge",
        "Stationsname",
        "Bundesland",
    ],
)
import folium
from folium.plugins import MarkerCluster
import pandas as pd

# Map of all candidate stations, centered on Brannenburg so the closest
# stations are easy to spot.
brannenburg_coords = [47.7424, 12.1041]  # source: Wikipedia
my_map = folium.Map(location=brannenburg_coords, zoom_start=10)
for _, row in df.iterrows():
    # Popup shows the station name, id, elevation and validity period.
    folium.Marker(
        [row["geoBreite"], row["geoLaenge"]],
        popup=f"""
{row["Stationsname"]} ({str(row["Stations_id"])})
<br/>
{str(row["Stationshoehe"])} m
<br/>
Von: {str(row["von_datum"])}
<br/>
Bis: {str(row["bis_datum"])}
""",
    ).add_to(my_map)
my_map

# Candidate stations: Kiefersfelden-Gach (3679) or Rosenheim (4261).
# Wendelstein (5467) is also close but much higher (832 m). Kiefersfelden
# (518 m) and Rosenheim (442 m) are far closer to Brannenburg's elevation
# (509 m per Wikipedia), so one of those is probably the better choice.
# Helper to find and download the data files for the selected stations.
# A function because we need it at least twice: there are files with
# historical data and files with recent data.
import requests
from bs4 import BeautifulSoup


def download_climate_data(listing_url):
    """Download the target stations' files linked on a DWD listing page.

    Parameters
    ----------
    listing_url : str
        URL of a DWD open-data directory listing.

    Returns
    -------
    list of str
        Local paths of the downloaded files under interim/climate/latest/.
    """
    # Fetch the directory listing; fail early instead of parsing an error page.
    response = requests.get(listing_url)
    response.raise_for_status()

    # Parse HTML content
    soup = BeautifulSoup(response.text, "html.parser")

    # Extract links whose file name contains one of the target station ids
    # (03679 = Kiefersfelden-Gach, 04261 = Rosenheim).
    target_links = [
        a["href"]
        for a in soup.find_all("a", href=True)
        if "_03679_" in a["href"] or "_04261_" in a["href"]
    ]

    downloaded_files = []
    for link in target_links:
        print("Downloading: ", link)
        local_path = "interim/climate/latest/" + link
        epyfun.web.download_file(listing_url + "/" + link, local_path)
        downloaded_files.append(local_path)
    return downloaded_files
import pandas as pd
import zipfile
import io


def read_csv_from_zip(zip_file_path):
    """Read the single "produkt*" CSV inside a DWD station zip archive.

    Parameters
    ----------
    zip_file_path : str
        Path to a DWD stundenwerte_*.zip archive.

    Returns
    -------
    pandas.DataFrame
        The semicolon-separated data file parsed as a DataFrame.

    Raises
    ------
    Exception
        If the archive contains no file, or more than one file, whose
        name starts with "produkt".
    """
    with zipfile.ZipFile(zip_file_path, 'r') as zip_file:
        # DWD archives bundle several metadata files; the actual data
        # file is the one whose name starts with "produkt".
        file_names = zip_file.namelist()
        target_files = [file_name for file_name in file_names if file_name.startswith("produkt")]
        print(target_files)
        if not target_files:
            raise Exception("No files start with produkt")
        if len(target_files) > 1:
            raise Exception("There are more files that start with produkt. Double check")
        target_file_name = target_files[0]
        # Read the CSV directly from the archive without extracting it.
        with zip_file.open(target_file_name) as file_in_zip:
            df = pd.read_csv(io.TextIOWrapper(file_in_zip), sep=';')
    return df


def bind_rows_files(all_files):
    """Concatenate the data of several DWD zip archives into one DataFrame.

    MESS_DATUM (format YYYYMMDDHH) is parsed into a proper datetime column.
    """
    all_dfs = [read_csv_from_zip(file) for file in all_files]
    result_df = pd.concat(all_dfs, ignore_index=True)
    result_df['MESS_DATUM'] = pd.to_datetime(result_df['MESS_DATUM'], format='%Y%m%d%H')
    return result_df


# Air temperature (Lufttemperatur) — hourly resolution
import numpy as np
import plotly.express as px

# Download historical and recent air-temperature files and stack them.
historical_files = download_climate_data("https://opendata.dwd.de/climate_environment/CDC/observations_germany/climate/hourly/air_temperature/historical/")
recent_files = download_climate_data("https://opendata.dwd.de/climate_environment/CDC/observations_germany/climate/hourly/air_temperature/recent/")
air = bind_rows_files(historical_files + recent_files)

aircols = ["TT_TU", "RF_TU"]

# -999 is DWD's missing-value sentinel (e.g. for RF_TU, relative humidity);
# replace it with NaN so it does not distort plots and aggregates.
air = air.replace(-999, np.nan)

# Quick visual check of the series since 2020.
filtered_df = air[air['MESS_DATUM'] > '2020-01-01']
fig = px.line(
    filtered_df,
    x="MESS_DATUM",
    y=aircols,
    labels={"value": "Values"},
    line_shape="linear"
)
fig.show()
epyfun.splom(filtered_df, aircols + ["MESS_DATUM"]).show()

# Dew point temperature (Taupunkttemperatur) — hourly resolution
import numpy as np
import plotly.express as px

# Download historical and recent dew-point files and stack them.
historical_files = download_climate_data("https://opendata.dwd.de/climate_environment/CDC/observations_germany/climate/hourly/dew_point/historical/")
recent_files = download_climate_data("https://opendata.dwd.de/climate_environment/CDC/observations_germany/climate/hourly/dew_point/recent/")
dew = bind_rows_files(historical_files + recent_files)

# NOTE: the leading spaces are part of the actual column names in the file.
dewcols = [" TT", " TD"]

# -999 is DWD's missing-value sentinel; replace it with NaN.
dew = dew.replace(-999, np.nan)

# Quick visual check of the series since 2020.
filtered_df = dew[dew['MESS_DATUM'] > '2020-01-01']
fig = px.line(
    filtered_df,
    x="MESS_DATUM",
    y=dewcols,
    labels={"value": "Values"},
    line_shape="linear"
)
fig.show()
epyfun.splom(filtered_df, dewcols + ["MESS_DATUM"]).show()

# Moisture — hourly resolution
import plotly.express as px

# Download historical and recent moisture files and stack them.
historical_files = download_climate_data("https://opendata.dwd.de/climate_environment/CDC/observations_germany/climate/hourly/moisture/historical/")
recent_files = download_climate_data("https://opendata.dwd.de/climate_environment/CDC/observations_germany/climate/hourly/moisture/recent/")
moist = bind_rows_files(historical_files + recent_files)

# NOTE(review): unlike the other sections, no -999 -> NaN replacement is
# done here — confirm whether the moisture files use the same sentinel.
moistcols = ['VP_STD', 'TF_STD', 'P_STD', 'TT_STD', 'RF_STD', 'TD_STD']

# Quick visual check of the series since 2020.
filtered_df = moist[moist['MESS_DATUM'] > '2020-01-01']
fig = px.line(
    filtered_df,
    x="MESS_DATUM",
    y=moistcols,
    labels={"value": "Values"},
    line_shape="linear"
)
fig.show()
epyfun.splom(filtered_df, moistcols + ["MESS_DATUM"]).show()

# Precipitation — hourly resolution
import numpy as np
import plotly.express as px

# Download historical and recent precipitation files and stack them.
historical_files = download_climate_data("https://opendata.dwd.de/climate_environment/CDC/observations_germany/climate/hourly/precipitation/historical/")
recent_files = download_climate_data("https://opendata.dwd.de/climate_environment/CDC/observations_germany/climate/hourly/precipitation/recent/")
precip = bind_rows_files(historical_files + recent_files)

# NOTE: the leading space in ' R1' is part of the actual column name.
precipcols = [' R1', 'RS_IND', 'WRTR']

# -999 is DWD's missing-value sentinel; replace it with NaN.
precip = precip.replace(-999, np.nan)

# Quick visual check of the series since 2020.
filtered_df = precip[precip['MESS_DATUM'] > '2020-01-01']
fig = px.line(
    filtered_df,
    x="MESS_DATUM",
    y=precipcols,
    labels={"value": "Values"},
    line_shape="linear"
)
fig.show()
epyfun.splom(filtered_df, precipcols + ["MESS_DATUM"]).show()

# Join everything into a single table and aggregate to daily resolution
# Inner-join the four hourly tables on timestamp and station id.
oncols = ["MESS_DATUM", "STATIONS_ID"]
agg_df = air.merge(dew[oncols + dewcols], on=oncols)
agg_df = agg_df.merge(moist[oncols + moistcols], on=oncols)
agg_df = agg_df.merge(precip[oncols + precipcols], on=oncols)

datacols = aircols + dewcols + moistcols + precipcols

# Collapse hourly values to daily min/mean/max per variable.
agg_df["date"] = agg_df["MESS_DATUM"].dt.normalize()
agg_df = (
    agg_df.groupby("date")
    .agg({key: ["min", "mean", "max"] for key in datacols})
    .reset_index()
)
# Flatten/sanitize the MultiIndex column names produced by .agg().
agg_df = epyfun.pandas.clean_names(agg_df)

# epyfun.splom(agg_df).show("browser")

# Merge everything and save the file
# agg_df.to_parquet('./interim/climate/climate_daily.parquet', index=False)