"""
cca.py: user inputs the month of choice and the forecast lead time. program
then loads in the HUC precipitation and the ERA-5 analysis data,
and it munges the data to make it suitable for Canonical Correlation Analysis.
It calls scipy.stats library routine to perform the CCA and saves output.
"""

from netCDF4 import Dataset
import numpy as np
from dateutils import daterange, datetohrs, dateshift
from datetime import datetime
import sys
import os
from os import path
import numpy.ma as ma
import _pickle as cPickle
import matplotlib.pyplot as plt
from matplotlib import rcParams
from mpl_toolkits.basemap import Basemap
import scipy.stats as stats
from sklearn.cross_decomposition import PLSRegression

# ==================================================================
    
def load_era5(cyear, cmonth, cvar):
    """ read the ERA5 reanalysis data at 4 vertical levels, 
    NHem, averaged to 5 degrees """
    infile = 'cca/'+cyear+cmonth+'_'+cvar+'_era5_5deg.cPick'
    #print (infile)
    inf = open(infile,'rb')
    input_data = cPickle.load(inf)
    ntimes, nlevels, ny, nx = np.shape(input_data)
    lon = cPickle.load(inf)
    lat = cPickle.load(inf)
    yyyymmddhh = cPickle.load(inf)
    inf.close()
    return yyyymmddhh, input_data, ntimes, nlevels, ny, nx, lon, lat
    
# ==================================================================
    
def convert_to_yyyymmddhh(precipDates):
    
    # --- convert Matt's HUC date array to yyyymmddhh format, making the 
    #     assumption that the 7am local time is close enough to 12Z.
    
    npdates, nymd = np.shape(precipDates)
    yyyymmddhh = []
    #print ('npdates, nymd = ', npdates, nymd)
    for i in range(npdates):
        
        yyyy = str(precipDates[i,0])
        imm = precipDates[i,1]
        if imm < 10:
            cmm = '0'+str(imm)
        else:
            cmm = str(imm)
        idd = precipDates[i,2]
        if idd < 10:
            cdd = '0'+str(idd)
        else:
            cdd = str(idd)
        yyyymmddhh.append(int(yyyy+cmm+cdd+'12'))
        #print (precipDates[i,0], precipDates[i,1], \
        #precipDates[i,2], int(yyyy+cmm+cdd+'12'))
        #if i == 1000: sys.exit()
    return yyyymmddhh
    
# ==================================================================

def compute_n_sampledays(cmonth):
    """ compute the number of days of samples """    

    imonth = int(cmonth)
    if imonth == 1 or imonth == 3 or imonth == 5 or imonth == 7 \
    or imonth == 8 or imonth == 10 or imonth == 12:
        ndays_total = 31*nyears
    elif imonth == 4 or imonth == 6 or imonth == 9 or imonth == 11:
        ndays_total = 30*nyears 
    elif imonth == 2:
        ndays_total = 9*29 + (nyears-9)*28 
    return ndays_total
    
# ==================================================================

def read_HUC_data():
            
    """ read in the HUC data provided by Matt Switanek """

    f1 = open("cca/PRISM_pr_hucs_19810101-20180930.pickle",'rb')
    precipHUCs = cPickle.load(f1) #the daily precip accum in mm (days,hucs)
    ndayhucs, nhucs = np.shape(precipHUCs)
    precipDates = cPickle.load(f1) #the dates
    hucLats = cPickle.load(f1) #centroid lats of hucs
    hucLons = cPickle.load(f1) #centroid lons of hucs
    hucShapes = cPickle.load(f1) #embedded lists of huc boundaries
    hucDivision4 = cPickle.load(f1) #the div 4 numeric codes of the hucs
    f1.close()  
    return precipHUCs, ndayhucs, nhucs, precipDates, hucLats,\
         hucLons, hucShapes, hucDivision4      
        
# ==================================================================

def control_load_era5(cyears, cmonth):
    """ ERA5 was upscaled on Tom Hamill's mac24 in cca directory from
    Cathy Smith data store of ERA5 data, copied to /Public/thamill, 
    then ftped to Tom's home computer """

    ktr = 0
    for iyear, cyear in enumerate(cyears):
    
        yyyymmddhh, temp, ntimes, nlevels, ny, nx, lon, lat = \
            load_era5(cyear, cmonth, 'air')
        yyyymmddhh, shum, ntimes, nlevels, ny, nx, lon, lat = \
            load_era5(cyear, cmonth, 'shum')    
        yyyymmddhh, uwnd, ntimes, nlevels, ny, nx, lon, lat = \
            load_era5(cyear, cmonth, 'uwnd')    
        yyyymmddhh, vwnd, ntimes, nlevels, ny, nx, lon, lat = \
            load_era5(cyear, cmonth, 'vwnd') 
                
        if iyear == 0:
            zero_store = np.zeros((ntimes, nlevels, ny, nx), dtype=np.float32)
            temp_store = np.zeros((ndays_total,4,ny//2, nx), dtype=np.float64) 
            shum_store = np.zeros((ndays_total,4,ny//2, nx), dtype=np.float64)
            uwnd_store = np.zeros((ndays_total,4,ny//2, nx), dtype=np.float64)
            vwnd_store = np.zeros((ndays_total,4,ny//2, nx), dtype=np.float64)
            yyyymmddhh_store = np.zeros((ndays_total), dtype=np.int32)
                
        shum = np.where(shum < 0.0, zero_store, shum)
        temp_store[ktr:ktr+ntimes,:,:,:] = temp[:,:,0:ny//2,:]
        shum_store[ktr:ktr+ntimes,:,:,:] = shum[:,:,0:ny//2,:]**0.3333
        uwnd_store[ktr:ktr+ntimes,:,:,:] = uwnd[:,:,0:ny//2,:]
        vwnd_store[ktr:ktr+ntimes,:,:,:] = vwnd[:,:,0:ny//2,:]
        yyyymmddhh_store[ktr:ktr+ntimes] = yyyymmddhh[:]
        ktr = ktr + ntimes 
    return zero_store, temp_store, shum_store, uwnd_store, \
        vwnd_store, yyyymmddhh_store, ktr, nx, ny//2,        

# ==================================================================

def standard_normal(data_store,ktr):
    """ computed standardized anomalies, return in same data structure """
    
    data_stddev = np.std(data_store, axis=0)
    data_mean = np.mean(data_store, axis=0)
    for idate in range(ktr):
        data_store[idate,:,:,:] = (data_store[idate,:,:,:] - data_mean[:,:,:]) / data_stddev[:,:,:]
    return data_store
    
# ==================================================================
       
def munge_predictors_predictand_to_2darray(nx, ny, ktr, nhucs, temp_store, \
    shum_store, uwnd_store, vwnd_store, precipHUCs, yyyymmddhh_hucs, \
    yyyymmddhh_store, ifhour):       

    # --- rearrange data; first index is time, second index is 1-d composite vector
    #     of the standardized temperature, specific humidity, and u- and v-wind 
    #     components

    ngrid = nx*ny*4
    X = np.zeros((ktr, ngrid*4), dtype=np.float64) # 4 as there are four fields
    Y = np.zeros((ktr, nhucs), dtype=np.float64)
    print ('ktr = ', ktr)
    for idate in range(ktr):
        temp_1d = np.reshape(temp_store[idate,:,:,:], ngrid)
        shum_1d = np.reshape(shum_store[idate,:,:,:], ngrid)
        uwnd_1d = np.reshape(uwnd_store[idate,:,:,:], ngrid)
        vwnd_1d = np.reshape(vwnd_store[idate,:,:,:], ngrid)
        X[idate,0:ngrid] = temp_1d[:]
        X[idate,ngrid:2*ngrid] = shum_1d[:]
        X[idate,2*ngrid:3*ngrid] = uwnd_1d[:]
        X[idate,3*ngrid:4*ngrid] = vwnd_1d[:]
                
        # --- for an n.5 -day forecast, pluck the HUC data offset by +n.5 days.
        #     got to get this data individually for the given month of each year.
        #     yyyymmddhh_store has the vector of initial condition dates.
        #     yyyymmddhh_hucs has the vector of HUC verification period (end)
    
        #print ('str(yyyymmddhh_store[idate] = ', str(yyyymmddhh_store[idate]))
        fcst_date = int(dateshift(str(yyyymmddhh_store[idate]), ifhour))
        timeindex = yyyymmddhh_hucs.index(fcst_date)
        Y[idate,:] = precipHUCs[timeindex,:]**0.3333
        
    return X, Y

# ==================================================================

def separate_train_validation (X, Y, ixval, nxval, ktr):

    """separate data into training and validation parts
    for cross validation """
    
    ixstart = (ktr*ixval) // nxval
    ixend = ixstart + ktr//nxval
    ival = [*range(ixstart, ixend)]
    nval = len(ival)
    itrain = [*range(ktr)]
    del itrain[ixstart:ixend]
    ntrain = len(itrain)

    #ival = range(ixval,ktr,nxval)
    #itrain = []
    #for ix in range(nxval):
    #    if itrain != ixval:
    #        itrain = itrain + range(ix,ktr,nxval)
    #ntrain = len(itrain)
    
    Xtrain = X[itrain,:]
    Ytrain = Y[itrain,:]
    Xval = X[ival,:]
    Yval = Y[ival,:]

    return Xtrain, Xval, Ytrain, Yval, ntrain, \
        nval, itrain, ival

# ==================================================================
        
# --- get inputs from command line
#cmonth = sys.argv[1] # 01 to 12
#clead = sys.argv[2] # forecast lead time in days, e.g., 2.5   Use half days so the 00 UTC
cyears = \
    ['1981', '1982', '1983', '1984','1985', '1986', \
    '1987', '1988', '1989', '1990', '1991', '1992', \
    '1993', '1994', '1995', '1996', '1997', '1998', \
    '1999', '2000', '2001', '2002', '2003', '2004', \
    '2005', '2006', '2007', '2008', '2009', '2010', \
    '2011', '2012', '2013', '2014', '2015', '2016'] # , '2017']
nyears = len(cyears)
   
#for cmonth in ['01','02','03','04','05','06','07','08','09','10','11']:

# --- perform cross validation over 6 groups of 6 years.

nxval = 6
for cmonth in ['03']:
    ndays_total = compute_n_sampledays(cmonth)
    for clead in ['1.5']:
        ifhour = int(float(clead)*24)
        for ixval in range(nxval): 
            
            # ---- read analyzed precipitation in HUC4 boundaries. 
            
            precipHUCs, ndayhucs, nhucs, precipDates, hucLats,\
                hucLons, hucShapes, hucDivision4 = read_HUC_data()

            # ---- convert the precipitation dates into yyyymmddhh format used w. ERA5 data

            now = datetime.now()
            current_time = now.strftime("%H:%M:%S")
            print ('Converting HUC precipitation dates.  Current time is ', current_time)
            yyyymmddhh_hucs = convert_to_yyyymmddhh(precipDates)

            # ---- for the chosen month, load the ERA5 analysis data over the multiple 
            #      years

            zero_store, temp_store, shum_store, uwnd_store, vwnd_store, \
                yyyymmddhh_store, ktr, nx, ny = control_load_era5(cyears, cmonth)
    
            # --- convert to a standard normal deviate.

            temp_store = standard_normal(temp_store, ktr)
            shum_store = standard_normal(shum_store, ktr)
            uwnd_store = standard_normal(uwnd_store, ktr)
            vwnd_store = standard_normal(vwnd_store, ktr)

            X, Y = munge_predictors_predictand_to_2darray \
                (nx, ny, ktr, nhucs, temp_store, \
                shum_store, uwnd_store, vwnd_store, precipHUCs, \
                yyyymmddhh_hucs, yyyymmddhh_store, ifhour)      

            # --- apply standard deviation to Y, HUC data  
    
            Ystd = np.std(Y,axis=0)
            Ymean = np.mean(Y,axis=0)
            for idate in range(ktr):
                Y[idate,:] = (Y[idate,:]-Ymean[:]) / Ystd[:]

            # --- separate the data into training and validation parts
            
            Xtrain, Xval, Ytrain, Yval, ntrain, nval, itrain, ival = \
                separate_train_validation (X, Y, ixval, nxval, ktr)

            # --- perform the partial-least-squares regression on the
            #     training data.

            now = datetime.now()
            current_time = now.strftime("%H:%M:%S")
            print ("Starting PLSR.  Current time = ", current_time)
            plsr = PLSRegression(n_components=160, scale=False)
            plsr.fit(Xtrain,Ytrain)
            Ypred = plsr.predict(Xval)
            now = datetime.now()
            current_time = now.strftime("%H:%M:%S")
            print ("Finished PLSR.  Current time = ", current_time)
            
            # ---- reconstitute the full vector of forecast data, including
            #      back transformation to original space.

            if ixval == 0: Ypred_full = np.copy(Y)
            print ('np.shape(Ypred_full[ival,:]) = ', np.shape(Ypred_full[ival,:]))
            print ('np.shape(Ypred[:,:]) = ', np.shape(Ypred[:,:]))
            print ('np.shape(Ystd[:]) = ', np.shape(Ystd[:]))
            print ('np.shape(Ymean[:]) = ', np.shape(Ymean[:]))
            
            Ypred_full[ival,:] = (Ypred[:,:]*Ystd[:] + Ymean[:])**3.0
            
        # ---- save cross-validated predicted precipitation forecasts to file

        outfile = 'cca/PLSR_regression_data_month='+cmonth+'_lead='+clead+'days.cPick'
        print ('writing to ', outfile)
        ouf = open(outfile, 'wb')
        cPickle.dump(Ypred_full, ouf)
        cPickle.dump(yyyymmddhh_store, ouf)
        ouf.close()