""" Author: Anthony Gray, ORE Catapult Date: 10 July 2020 Modified: 31 May 2021 (prepared for public release) Purpose: Extract ERA5 weather time-series data to CSV for any location (1979+ available, 0.5 decimal degree resolution) Warning: This process can take a long time - e.g. a 30 year dataset with wind speed, Hs and Tp will take 10 hours Accompanying documentation: https://ore.catapult.org.uk/analysisinsight/weather-time-series-data-in-offshore-renewables/ __________ Instructions for using this script __________ 1. Re-save the script in a folder as .py file 2. Download Python 3 via Anaconda - Link: https://www.anaconda.com/products/individual - Go to the Anaconda website and download Python 3 (whichever version is most recent) - Stick with any recommended settings throughout the installation - Remember where you’ve installed it because you need that ‘path’ in the next step - e.g. mine is 'C:/Users/anthony.gray/AppData/Local/Continuum/anaconda3' (use back slashes, not forward slashes like the example) 3. Register with Copernicus and add the CDS configuration - Register an account: https://cds.climate.copernicus.eu/cdsapp#!/home - You then need to follow the Step-by-step guide on this page: https://confluence.ecmwf.int/display/CKB/How+to+install+and+use+CDS+API+on+Windows - On prerequisites, you’ve done point 1, and just ignore points 2 and 3 - To complete prerequisites point 4, open Anaconda Prompt (you can find it by typing into your toolbar) [*3b] - Then type in ‘conda config --add channels conda-forge’ and press enter - When that is done, type in ‘conda install cdsapi’ and press enter, when prompted press ‘y’ and enter - Now on the Step-by-step guide, follow the points. Some info which may help: - You’ll need to create the ‘.cdsapirc’ file: https://confluence.ecmwf.int/pages/viewpage.action?pageId=139068264 - Put the ‘.cdsapirc’ file in your C drive – e.g. mine is in ‘C:/Users/anthony.gray’ – and copy in the two-line code (back slashes) - For point 4, remember you may need to change directory [*3b] in the Anaconda prompt before attempting the pip commands – ‘pip3 install cdsapi’ or ‘pip3 install --user cdsapi’ - On point 5, you may need to accept the Copernicus terms and conditions if not already done (https://cds.climate.copernicus.eu/cdsapp/#!/terms/licence-to-use-copernicus-products) *3b - if Anaconda prompt gives an error at this point then you will likely need to change the 'path' - Type in ‘cd ‘ followed by the path where your Python executable is – e.g. I would type ‘cd C:/Users/anthony.gray/AppData/Local/Continuum/anaconda3’ (back slashes) 4. Install xarray - You need the xarray package to make the code work - Open your Anaconda Prompt again, change directory like before, then type in ‘conda install -c anaconda xarray’ and press enter, when prompted press ‘y’ and enter - I’ve had issues when doing this on my personal laptop – I had to ‘conda uninstall numpy’ twice before being able to run my code – but hopefully the above points will suffice for you. 5. Open the script in an Integrated Development Environments (IDE) - If you are a new user of Python then I recommend you use Spyder as your IDE (already installed through Anaconda) - Open Spyder by searching for it in your toolbar - Open the Python script in your IDE 6. Define your user settings in the script - You only need to change variables in the USER DEFINED INPUTS - HIGH PRIORITY section near the top of the script - I recommend you keep the 'site_name' quite short as the script will add the coordinates and years to the output filename - The ‘site_coordinates’ (latitude, longitude) need to be in decimal degrees and 0.5 degree resolution is required. - Set the ‘start_year’ and ‘end_year’ – remember that it is inclusive (so 2000 to 2001 will get two years of data) - Selecting your parameters should be self explanatory - you can select multiple parameters - If extracting wind speed, remember to set ‘wind_speed_in_knots’ to False if you want wind speed in m/s. Otherwise, leave as True if you want knots. 7. Run the script. - Press F5 or the green ‘play’ button on the top toolbar (if using Spyder) - Remember that it takes a while to run, but you can do other things (e.g. Excel, Word etc.) whilst it is running - The output CSV file will be stored in the ‘Output_CSVs’ folder (a subfolder of where you've stored the script) - I suggest you run the script in one go, so leave your computer on overnight if needs be – it is possible to stop and re-start but I wouldn’t recommend it as it would involve fiddling with settings. Connection to the API is lost occasionally which delays the script, but it will complete in time. ________________________________________________________ Documentation for ERA5 hourly data (e.g. if you want to check units or alternative parameters): "https://cds.climate.copernicus.eu/cdsapp#!/dataset/reanalysis-era5-single-levels?tab=overview" """ import time import os import cdsapi import pandas as pd import xarray as xr import numpy as np class ExtractCDSData(object): def __init__(self): """ Location for user to define key inputs for extraction """ """ USER DEFINED INPUTS - HIGH PRIORITY """ # Need to be checked and modified on a case by case basis # Define what you want your output file to be called (short as coordinates and years are in the name too) self.site_name = "NE1" # Define where you want data to be extracted - must use 0.5 degrees spatial resolution (check location on map) self.site_coordinates = (60.0, 0.0) # latitude, longitude (both decimal degrees format) # Define what years to want to extract data between (from 1 Jan in start year to 31 dec in end year) self.start_year = 1991 self.end_year = 2020 # Define what data you want to extract self.get_wind_speed_10m = False # Wind speed at 10m height, set as True or False self.get_wind_speed_100m = True # Wind speed at 100m height, set as True or False self.wind_speed_in_knots = True # True for knots, False for m/s self.get_wind_direction_10m = False # Wind direction (from) at 10m height, set as True or False self.get_wind_direction_100m = False # Wind direction (from) at 100m height, set as True or False self.get_significant_wave_height = True # Hs, set as True or False self.get_peak_wave_period = True # Tp, set as True or False self.get_wave_direction = False # Set as True or False self.get_air_density = False # Set as True or False self.get_temperature = False # Set as True or False """ USER DEFINED INPUTS - LOW PRIORITY """ # Can be changed by user but will default to values otherwise # Column headers self.c_wind_speed_10m = 'wind_speed_10m' # units are added depending of user selection for wind_speed_in_knots self.c_wind_speed_100m = 'wind_speed_100m' # as above self.c_wind_direction_10m = 'wind_direction_10m_deg' self.c_wind_direction_100m = 'wind_direction_100m_deg' self.c_wave_height = 'wave_height_m' self.c_wave_period = 'wave_peak_period_s' self.c_wave_direction = 'wave_direction_deg' self.c_air_density = 'air_density_kgm-3' self.c_temperature = 'temperature_K' self.netcdf_dir = os.getcwd() + '\\Downloaded_NetCDFs' # Folder will be created if it doesn't exist self.output_csv_dir = os.getcwd() + '\\Output_CSVs' # Folder will be created if it doesn't exist self.process_data_only = False # if you have already downloaded the netcdf files then set this to True self.delete_netcdfs_after_use = True # True if you want to delete the netcdf files after conversion to csv """ NO USER INPUT NEEDED FROM HERE ONWARDS """ """ CHECK INPUTS """ self.check_site_coordinates() self.check_directories() """ OTHER """ # Add units for wind speed column if self.wind_speed_in_knots: self.c_wind_speed_10m += '_kts' self.c_wind_speed_100m += '_kts' else: self.c_wind_speed_10m += '_mps' self.c_wind_speed_100m += '_mps' # API client self.api_c = cdsapi.Client() # Empty dataframe for data self.data_df = pd.DataFrame() # Map user inputs for data to era5 parameter names self.era5_parameters_names = self.map_parameters() # Initialise api request details dictionary self.api_request_details = { 'variable': None, 'product_type': 'reanalysis', 'year': None, 'month': [str(m).zfill(2) for m in range(1, 13)], 'day': [str(d).zfill(2) for d in range(1, 32)], 'time': [str(hr).zfill(2)+":00" for hr in range(24)], 'format': 'netcdf', 'area': None, } """ RUN FUNCTIONS """ # Run extraction and data processing if not self.process_data_only: self.retrieve_era5_data() # extract relevant data using API self.convert_netcdf_to_df() # convert the downloaded NetCDF files to pandas DataFrames self.calculate_wind_parameters() # need to use U and V component of wind to calculate wind speed and direction self.print_data_to_csv() # print the data to csv if self.delete_netcdfs_after_use: self.delete_netcdf_files() # if finished with the netcdf files then delete def check_site_coordinates(self): """ Raise an error if the input site parameter are not 0.5 degree resolution (to match ERA5) """ for i in self.site_coordinates: if i % 0.5 != 0: raise TypeError("Input site coordinates need to be 0.5 degree resolution") def check_directories(self): """ Create the directories if they don't already exist """ for this_dir in [self.netcdf_dir, self.output_csv_dir]: if not os.path.exists(this_dir): os.makedirs(this_dir) def map_parameters(self) -> dict: """ Use the user inputs for desired parameters to map to the correct parameter name in the era5 dataset Note: wind speed and direction is calculated later from U and V components :return: a dict with keys as column names or identifiers with values as era5 parameter names """ parameter_names = {} if self.get_wind_speed_10m or self.get_wind_direction_10m: parameter_names['U_wind_10m'] = '10m_u_component_of_wind' parameter_names['V_wind_10m'] = '10m_v_component_of_wind' if self.get_wind_speed_100m or self.get_wind_direction_100m: parameter_names['U_wind_100m'] = '100m_u_component_of_wind' parameter_names['V_wind_100m'] = '100m_v_component_of_wind' if self.get_significant_wave_height: parameter_names[self.c_wave_height] = 'significant_height_of_combined_wind_waves_and_swell' if self.get_peak_wave_period: parameter_names[self.c_wave_period] = 'peak_wave_period' if self.get_wave_direction: parameter_names[self.c_wave_direction] = 'mean_wave_direction' if self.get_air_density: parameter_names[self.c_air_density] = 'air_density_over_the_oceans' if self.get_temperature: parameter_names[self.c_temperature] = '2m_temperature' return parameter_names def retrieve_era5_data(self): """ Go through each user-specified parameter and year and extract ERA5 data using the API (takes a long time!) return: use the api CDS client for each parameter and year and save in netcdf files """ # Loop through all parameters in the era5 parameter names inputs for parameter, era5_parameter_name in self.era5_parameters_names.items(): # Loop through all years specified by the user for year in range(self.start_year, self.end_year + 1): # Tell user where we are print(f"Extracting {parameter} in year {year}...") # Make a copy of the request details template and amend accordingly request_details = self.api_request_details.copy() request_details['variable'] = era5_parameter_name request_details['year'] = str(year) request_details['area'] = [str(self.site_coordinates[0]), str(self.site_coordinates[1]), str(self.site_coordinates[0]), str(self.site_coordinates[1])] # Call API and produce output file output_file = os.path.join(self.netcdf_dir, f"{parameter}_{year}.nc") if not os.path.exists(output_file): self.api_c.retrieve('reanalysis-era5-single-levels', request_details, output_file) def convert_netcdf_to_df(self): """ Convert the netcdf files to a single pandas DataFrame :return: update self.data_df to be a DataFrame storing all the data """ print("Processing downloaded data...") # Loop through all parameters in the era5 parameter names inputs for parameter, era5_parameter_name in self.era5_parameters_names.items(): df_list = [] # initialise list of DataFrames for this parameter # Loop through all years specified by the user for year in range(self.start_year, self.end_year + 1): # Get netcdf file for this parameter and year and open it netcdf_file = os.path.join(self.netcdf_dir, f"{parameter}_{year}.nc") if os.path.exists(netcdf_file): data = xr.open_dataset(netcdf_file) # Convert the data into a pandas DataFrame df = data.to_dataframe() df.index = df.index.droplevel().droplevel() # don't need to keep the location coordinates in index df.columns = [parameter] # call the existing data the right name df_list.append(df) else: print(f"WARNING: can't find {netcdf_file}!") # Join all DataFrames together as one self.data_df = pd.concat([self.data_df, pd.concat(df_list)], axis=1) # convert time stamp (index) into columns self.data_df['Year'] = self.data_df.index.year self.data_df['Month'] = self.data_df.index.month self.data_df['Day'] = self.data_df.index.day self.data_df['Hour'] = self.data_df.index.hour # put the time-related columns first cols = self.data_df.columns.tolist() self.data_df = self.data_df[cols[-4:] + cols[:-4]] def calculate_wind_parameters(self): """ Calculate wind speed and direction based on the U and V components of wind speed, and drop those columns Conversion of m/s to knots needed (if specified by user) Source: https://confluence.ecmwf.int/pages/viewpage.action?pageId=133262398 :return: add a column for wind speed to the DataFrame """ # Calculate wind speed at 10m height if self.get_wind_speed_10m: self.data_df.loc[:, self.c_wind_speed_10m] = \ np.sqrt((self.data_df['U_wind_10m'] ** 2) + (self.data_df['V_wind_10m'] ** 2)) # convert wind speed in m/s to knots if self.wind_speed_in_knots: self.data_df[self.c_wind_speed_10m] = self.data_df[self.c_wind_speed_10m] * 1.9438444924406 # Calculate wind speed at 100m height if self.get_wind_speed_100m: self.data_df.loc[:, self.c_wind_speed_100m] = \ np.sqrt((self.data_df['U_wind_100m'] ** 2) + (self.data_df['V_wind_100m'] ** 2)) # convert wind speed in m/s to knots if self.wind_speed_in_knots: self.data_df[self.c_wind_speed_100m] = self.data_df[self.c_wind_speed_100m] * 1.9438444924406 # Calculate wind direction at 10m height (direction wind is blowing from, in degrees) if self.get_wind_direction_10m: self.data_df.loc[:, self.c_wind_direction_10m] = \ (180 + (np.degrees(np.arctan2(self.data_df['U_wind_10m'], self.data_df['V_wind_10m'])))) % 360 # Calculate wind direction at 100m height (direction wind is blowing from, in degrees) if self.get_wind_direction_100m: self.data_df.loc[:, self.c_wind_direction_100m] = \ (180 + (np.degrees(np.arctan2(self.data_df['U_wind_100m'], self.data_df['V_wind_100m'])))) % 360 # Delete unused columns if self.get_wind_speed_10m or self.get_wind_direction_10m: self.data_df.drop(['U_wind_10m', 'V_wind_10m'], axis=1, inplace=True) if self.get_wind_speed_100m or self.get_wind_direction_100m: self.data_df.drop(['U_wind_100m', 'V_wind_100m'], axis=1, inplace=True) def print_data_to_csv(self): """ Print the output DataFrame to CSV """ print("Printing CSV file...") output_file = f"Loc{str(self.site_coordinates[0])}_{str(self.site_coordinates[1])}_Yr{self.start_year}-" \ f"{self.end_year}_{self.site_name}.csv" output_file = os.path.join(self.output_csv_dir, output_file) self.data_df.index.rename('TimeStamp', inplace=True) self.data_df.to_csv(output_file, index=True) # don't print time stamp as index def delete_netcdf_files(self): """ Delete all the files in the netcdf directory and delete the directory itself """ # Delete all files for root, dirs, files in os.walk(self.netcdf_dir): for file in files: os.remove(os.path.join(root, file)) # Delete directory os.rmdir(self.netcdf_dir) if __name__ == "__main__": start_time = time.time() extract_data = ExtractCDSData() print(f"Run-time: --- {(time.time() - start_time) / 60.} minutes ---")