"""
Author: Anthony Gray, ORE Catapult
Date: 10 July 2020
Modified: 31 May 2021 (prepared for public release)
Purpose: Extract ERA5 weather time-series data to CSV for any location (1979+ available, 0.5 decimal degree resolution)

Warning: This process can take a long time - e.g. a 30 year dataset with wind speed, Hs and Tp will take 10 hours

Accompanying documentation:
https://ore.catapult.org.uk/analysisinsight/weather-time-series-data-in-offshore-renewables/

__________ Instructions for using this script __________

1. Re-save the script in a folder as .py file

2.	Download Python 3 via Anaconda
    - Link: https://www.anaconda.com/products/individual
    - Go to the Anaconda website and download Python 3 (whichever version is most recent)
    - Stick with any recommended settings throughout the installation
    - Remember where you’ve installed it because you need that ‘path’ in the next step
    - e.g. mine is 'C:/Users/anthony.gray/AppData/Local/Continuum/anaconda3' (use back slashes, not forward slashes
    like the example)

3.	Register with Copernicus and add the CDS configuration
    - Register an account: https://cds.climate.copernicus.eu/cdsapp#!/home
    - You then need to follow the Step-by-step guide on this page:
    https://confluence.ecmwf.int/display/CKB/How+to+install+and+use+CDS+API+on+Windows
    - On prerequisites, you’ve done point 1, and just ignore points 2 and 3
    - To complete prerequisites point 4, open Anaconda Prompt (you can find it by typing into your toolbar) [*3b]
    - Then type in ‘conda config --add channels conda-forge’ and press enter
    - When that is done, type in ‘conda install cdsapi’ and press enter, when prompted press ‘y’ and enter
    - Now on the Step-by-step guide, follow the points. Some info which may help:
        - You’ll need to create the ‘.cdsapirc’ file:
        https://confluence.ecmwf.int/pages/viewpage.action?pageId=139068264
        - Put the ‘.cdsapirc’ file in your C drive – e.g. mine is in ‘C:/Users/anthony.gray’ – and copy in the two-line
        code (back slashes)
        - For point 4, remember you may need to change directory [*3b] in the Anaconda prompt before attempting the pip
        commands – ‘pip3 install cdsapi’ or ‘pip3 install --user cdsapi’
        - On point 5, you may need to accept the Copernicus terms and conditions if not already done
        (https://cds.climate.copernicus.eu/cdsapp/#!/terms/licence-to-use-copernicus-products)

*3b - if Anaconda prompt gives an error at this point then you will likely need to change the 'path'
    - Type in ‘cd ‘ followed by the path where your Python executable is – e.g. I would type
    ‘cd C:/Users/anthony.gray/AppData/Local/Continuum/anaconda3’ (back slashes)

4.	Install xarray
    - You need the xarray package to make the code work
    - Open your Anaconda Prompt again, change directory like before, then type in ‘conda install -c anaconda xarray’
    and press enter, when prompted press ‘y’ and enter
    - I’ve had issues when doing this on my personal laptop – I had to ‘conda uninstall numpy’ twice before being able
    to run my code – but hopefully the above points will suffice for you.

5.	Open the script in an Integrated Development Environments (IDE)
    - If you are a new user of Python then I recommend you use Spyder as your IDE (already installed through Anaconda)
        - Open Spyder by searching for it in your toolbar
    - Open the Python script in your IDE

6.	Define your user settings in the script
    - You only need to change variables in the USER DEFINED INPUTS - HIGH PRIORITY section near the top of the script
    - I recommend you keep the 'site_name' quite short as the script will add the coordinates and years to the output
    filename
    - The ‘site_coordinates’ (latitude, longitude) need to be in decimal degrees and 0.5 degree resolution is required.
    - Set the ‘start_year’ and ‘end_year’ – remember that it is inclusive (so 2000 to 2001 will get two years of data)
    - Selecting your parameters should be self explanatory - you can select multiple parameters
    - If extracting wind speed, remember to set ‘wind_speed_in_knots’ to False if you want wind speed in m/s.
    Otherwise, leave as True if you want knots.

7.	Run the script.
    - Press F5 or the green ‘play’ button on the top toolbar (if using Spyder)
    - Remember that it takes a while to run, but you can do other things (e.g. Excel, Word etc.) whilst it is running
    - The output CSV file will be stored in the ‘Output_CSVs’ folder (a subfolder of where you've stored the script)
    - I suggest you run the script in one go, so leave your computer on overnight if needs be – it is possible to stop
    and re-start but I wouldn’t recommend it as it would involve fiddling with settings. Connection to the API is lost
    occasionally which delays the script, but it will complete in time.

________________________________________________________

Documentation for ERA5 hourly data (e.g. if you want to check units or alternative parameters):
"https://cds.climate.copernicus.eu/cdsapp#!/dataset/reanalysis-era5-single-levels?tab=overview"
"""

import time
import os
import cdsapi
import pandas as pd
import xarray as xr
import numpy as np


class ExtractCDSData(object):

    def __init__(self):
        """
        Location for user to define key inputs for extraction
        """

        """ USER DEFINED INPUTS - HIGH PRIORITY """  # Need to be checked and modified on a case by case basis
        # Define what you want your output file to be called (short as coordinates and years are in the name too)
        self.site_name = "NE1"
        # Define where you want data to be extracted - must use 0.5 degrees spatial resolution (check location on map)
        self.site_coordinates = (60.0, 0.0)  # latitude, longitude (both decimal degrees format)
        # Define what years to want to extract data between (from 1 Jan in start year to 31 dec in end year)
        self.start_year = 1991
        self.end_year = 2020
        # Define what data you want to extract
        self.get_wind_speed_10m = False  # Wind speed at 10m height, set as True or False
        self.get_wind_speed_100m = True  # Wind speed at 100m height, set as True or False
        self.wind_speed_in_knots = True  # True for knots, False for m/s
        self.get_wind_direction_10m = False  # Wind direction (from) at 10m height, set as True or False
        self.get_wind_direction_100m = False  # Wind direction (from) at 100m height, set as True or False
        self.get_significant_wave_height = True  # Hs, set as True or False
        self.get_peak_wave_period = True  # Tp, set as True or False
        self.get_wave_direction = False  # Set as True or False
        self.get_air_density = False  # Set as True or False
        self.get_temperature = False  # Set as True or False

        """ USER DEFINED INPUTS - LOW PRIORITY """  # Can be changed by user but will default to values otherwise
        # Column headers
        self.c_wind_speed_10m = 'wind_speed_10m'  # units are added depending of user selection for wind_speed_in_knots
        self.c_wind_speed_100m = 'wind_speed_100m'  # as above
        self.c_wind_direction_10m = 'wind_direction_10m_deg'
        self.c_wind_direction_100m = 'wind_direction_100m_deg'
        self.c_wave_height = 'wave_height_m'
        self.c_wave_period = 'wave_peak_period_s'
        self.c_wave_direction = 'wave_direction_deg'
        self.c_air_density = 'air_density_kgm-3'
        self.c_temperature = 'temperature_K'
        self.netcdf_dir = os.getcwd() + '\\Downloaded_NetCDFs'  # Folder will be created if it doesn't exist
        self.output_csv_dir = os.getcwd() + '\\Output_CSVs'  # Folder will be created if it doesn't exist
        self.process_data_only = False  # if you have already downloaded the netcdf files then set this to True
        self.delete_netcdfs_after_use = True  # True if you want to delete the netcdf files after conversion to csv

        """ NO USER INPUT NEEDED FROM HERE ONWARDS """

        """ CHECK INPUTS """
        self.check_site_coordinates()
        self.check_directories()

        """ OTHER """
        # Add units for wind speed column
        if self.wind_speed_in_knots:
            self.c_wind_speed_10m += '_kts'
            self.c_wind_speed_100m += '_kts'
        else:
            self.c_wind_speed_10m += '_mps'
            self.c_wind_speed_100m += '_mps'
        # API client
        self.api_c = cdsapi.Client()
        # Empty dataframe for data
        self.data_df = pd.DataFrame()
        # Map user inputs for data to era5 parameter names
        self.era5_parameters_names = self.map_parameters()
        # Initialise api request details dictionary
        self.api_request_details = {
            'variable': None,
            'product_type': 'reanalysis',
            'year': None,
            'month': [str(m).zfill(2) for m in range(1, 13)],
            'day': [str(d).zfill(2) for d in range(1, 32)],
            'time': [str(hr).zfill(2)+":00" for hr in range(24)],
            'format': 'netcdf',
            'area': None,
        }

        """ RUN FUNCTIONS """
        # Run extraction and data processing
        if not self.process_data_only:
            self.retrieve_era5_data()  # extract relevant data using API
        self.convert_netcdf_to_df()  # convert the downloaded NetCDF files to pandas DataFrames
        self.calculate_wind_parameters()  # need to use U and V component of wind to calculate wind speed and direction
        self.print_data_to_csv()  # print the data to csv
        if self.delete_netcdfs_after_use:
            self.delete_netcdf_files()  # if finished with the netcdf files then delete

    def check_site_coordinates(self):
        """
        Raise an error if the input site parameter are not 0.5 degree resolution (to match ERA5)
        """
        for i in self.site_coordinates:
            if i % 0.5 != 0:
                raise TypeError("Input site coordinates need to be 0.5 degree resolution")

    def check_directories(self):
        """
        Create the directories if they don't already exist
        """
        for this_dir in [self.netcdf_dir, self.output_csv_dir]:
            if not os.path.exists(this_dir):
                os.makedirs(this_dir)

    def map_parameters(self) -> dict:
        """
        Use the user inputs for desired parameters to map to the correct parameter name in the era5 dataset
        Note: wind speed and direction is calculated later from U and V components
        :return: a dict with keys as column names or identifiers with values as era5 parameter names
        """
        parameter_names = {}
        if self.get_wind_speed_10m or self.get_wind_direction_10m:
            parameter_names['U_wind_10m'] = '10m_u_component_of_wind'
            parameter_names['V_wind_10m'] = '10m_v_component_of_wind'
        if self.get_wind_speed_100m or self.get_wind_direction_100m:
            parameter_names['U_wind_100m'] = '100m_u_component_of_wind'
            parameter_names['V_wind_100m'] = '100m_v_component_of_wind'
        if self.get_significant_wave_height:
            parameter_names[self.c_wave_height] = 'significant_height_of_combined_wind_waves_and_swell'
        if self.get_peak_wave_period:
            parameter_names[self.c_wave_period] = 'peak_wave_period'
        if self.get_wave_direction:
            parameter_names[self.c_wave_direction] = 'mean_wave_direction'
        if self.get_air_density:
            parameter_names[self.c_air_density] = 'air_density_over_the_oceans'
        if self.get_temperature:
            parameter_names[self.c_temperature] = '2m_temperature'

        return parameter_names

    def retrieve_era5_data(self):
        """
        Go through each user-specified parameter and year and extract ERA5 data using the API (takes a long time!)
        return: use the api CDS client for each parameter and year and save in netcdf files
        """
        # Loop through all parameters in the era5 parameter names inputs
        for parameter, era5_parameter_name in self.era5_parameters_names.items():
            # Loop through all years specified by the user
            for year in range(self.start_year, self.end_year + 1):
                # Tell user where we are
                print(f"Extracting {parameter} in year {year}...")

                # Make a copy of the request details template and amend accordingly
                request_details = self.api_request_details.copy()
                request_details['variable'] = era5_parameter_name
                request_details['year'] = str(year)
                request_details['area'] = [str(self.site_coordinates[0]), str(self.site_coordinates[1]),
                                           str(self.site_coordinates[0]), str(self.site_coordinates[1])]

                # Call API and produce output file
                output_file = os.path.join(self.netcdf_dir, f"{parameter}_{year}.nc")
                if not os.path.exists(output_file):
                    self.api_c.retrieve('reanalysis-era5-single-levels', request_details, output_file)

    def convert_netcdf_to_df(self):
        """
        Convert the netcdf files to a single pandas DataFrame
        :return: update self.data_df to be a DataFrame storing all the data
        """
        print("Processing downloaded data...")

        # Loop through all parameters in the era5 parameter names inputs
        for parameter, era5_parameter_name in self.era5_parameters_names.items():
            df_list = []  # initialise list of DataFrames for this parameter

            # Loop through all years specified by the user
            for year in range(self.start_year, self.end_year + 1):
                # Get netcdf file for this parameter and year and open it
                netcdf_file = os.path.join(self.netcdf_dir, f"{parameter}_{year}.nc")
                if os.path.exists(netcdf_file):
                    data = xr.open_dataset(netcdf_file)

                    # Convert the data into a pandas DataFrame
                    df = data.to_dataframe()
                    df.index = df.index.droplevel().droplevel()  # don't need to keep the location coordinates in index
                    df.columns = [parameter]  # call the existing data the right name
                    df_list.append(df)

                else:
                    print(f"WARNING: can't find {netcdf_file}!")

            # Join all DataFrames together as one
            self.data_df = pd.concat([self.data_df, pd.concat(df_list)], axis=1)

        # convert time stamp (index) into columns
        self.data_df['Year'] = self.data_df.index.year
        self.data_df['Month'] = self.data_df.index.month
        self.data_df['Day'] = self.data_df.index.day
        self.data_df['Hour'] = self.data_df.index.hour
        # put the time-related columns first
        cols = self.data_df.columns.tolist()
        self.data_df = self.data_df[cols[-4:] + cols[:-4]]

    def calculate_wind_parameters(self):
        """
        Calculate wind speed and direction based on the U and V components of wind speed, and drop those columns
        Conversion of m/s to knots needed (if specified by user)
        Source: https://confluence.ecmwf.int/pages/viewpage.action?pageId=133262398
        :return: add a column for wind speed to the DataFrame
        """
        # Calculate wind speed at 10m height
        if self.get_wind_speed_10m:
            self.data_df.loc[:, self.c_wind_speed_10m] = \
                np.sqrt((self.data_df['U_wind_10m'] ** 2) + (self.data_df['V_wind_10m'] ** 2))
            # convert wind speed in m/s to knots
            if self.wind_speed_in_knots:
                self.data_df[self.c_wind_speed_10m] = self.data_df[self.c_wind_speed_10m] * 1.9438444924406

        # Calculate wind speed at 100m height
        if self.get_wind_speed_100m:
            self.data_df.loc[:, self.c_wind_speed_100m] = \
                np.sqrt((self.data_df['U_wind_100m'] ** 2) + (self.data_df['V_wind_100m'] ** 2))
            # convert wind speed in m/s to knots
            if self.wind_speed_in_knots:
                self.data_df[self.c_wind_speed_100m] = self.data_df[self.c_wind_speed_100m] * 1.9438444924406

        # Calculate wind direction at 10m height (direction wind is blowing from, in degrees)
        if self.get_wind_direction_10m:
            self.data_df.loc[:, self.c_wind_direction_10m] = \
                (180 + (np.degrees(np.arctan2(self.data_df['U_wind_10m'], self.data_df['V_wind_10m'])))) % 360

        # Calculate wind direction at 100m height (direction wind is blowing from, in degrees)
        if self.get_wind_direction_100m:
            self.data_df.loc[:, self.c_wind_direction_100m] = \
                (180 + (np.degrees(np.arctan2(self.data_df['U_wind_100m'], self.data_df['V_wind_100m'])))) % 360

        # Delete unused columns
        if self.get_wind_speed_10m or self.get_wind_direction_10m:
            self.data_df.drop(['U_wind_10m', 'V_wind_10m'], axis=1, inplace=True)
        if self.get_wind_speed_100m or self.get_wind_direction_100m:
            self.data_df.drop(['U_wind_100m', 'V_wind_100m'], axis=1, inplace=True)

    def print_data_to_csv(self):
        """
        Print the output DataFrame to CSV
        """
        print("Printing CSV file...")
        output_file = f"Loc{str(self.site_coordinates[0])}_{str(self.site_coordinates[1])}_Yr{self.start_year}-" \
                      f"{self.end_year}_{self.site_name}.csv"
        output_file = os.path.join(self.output_csv_dir, output_file)
        self.data_df.index.rename('TimeStamp', inplace=True)
        self.data_df.to_csv(output_file, index=True)  # don't print time stamp as index

    def delete_netcdf_files(self):
        """
        Delete all the files in the netcdf directory and delete the directory itself
        """
        # Delete all files
        for root, dirs, files in os.walk(self.netcdf_dir):
            for file in files:
                os.remove(os.path.join(root, file))
        # Delete directory
        os.rmdir(self.netcdf_dir)


if __name__ == "__main__":
    start_time = time.time()
    extract_data = ExtractCDSData()
    print(f"Run-time: --- {(time.time() - start_time) / 60.} minutes ---")