""" cca.py: user inputs the month of choice and the forecast lead time. program then loads in the HUC precipitation and the ERA-5 analysis data, and it munges the data to make it suitable for Canonical Correlation Analysis. It calls scipy.stats library routine to perform the CCA and saves output. """ from netCDF4 import Dataset import numpy as np from dateutils import daterange, datetohrs, dateshift from datetime import datetime import sys import os from os import path import numpy.ma as ma import _pickle as cPickle import matplotlib.pyplot as plt from matplotlib import rcParams from mpl_toolkits.basemap import Basemap import scipy.stats as stats from sklearn.cross_decomposition import PLSRegression rcParams['xtick.labelsize']='x-small' rcParams['ytick.labelsize']='x-small' rcParams['legend.fontsize']='xx-small' rcParams['legend.fancybox']=True #def load_latlon(): # infile = 'cca/199901_air_era5.nc' # nc = Dataset(infile) # lat = nc.variables['lat'][:] # lon = nc.variables['lon'][:] # nc.close() def load_era5(cyear, cmonth, cvar): infile = 'cca/'+cyear+cmonth+'_'+cvar+'_era5.cPick' #print (infile) inf = open(infile,'rb') input_data = cPickle.load(inf) ntimes, nlevels, ny, nx = np.shape(input_data) lon = cPickle.load(inf) lat = cPickle.load(inf) yyyymmddhh = cPickle.load(inf) inf.close() return yyyymmddhh, input_data, ntimes, nlevels, ny, nx, lon, lat def convert_to_yyyymmddhh(precipDates): # --- convert Matt's HUC date array to yyyymmddhh format, making the # assumption that the 7am local time is close enough to 12Z. npdates, nymd = np.shape(precipDates) yyyymmddhh = [] #print ('npdates, nymd = ', npdates, nymd) for i in range(npdates): yyyy = str(precipDates[i,0]) imm = precipDates[i,1] if imm < 10: cmm = '0'+str(imm) else: cmm = str(imm) idd = precipDates[i,2] if idd < 10: cdd = '0'+str(idd) else: cdd = str(idd) yyyymmddhh.append(int(yyyy+cmm+cdd+'12')) #print (precipDates[i,0], precipDates[i,1], precipDates[i,2], int(yyyy+cmm+cdd+'12')) #if i == 1000: sys.exit() return yyyymmddhh # --- get inputs from command line #cmonth = sys.argv[1] # 01 to 12 #clead = sys.argv[2] # forecast lead time in days, e.g., 2.5 Use half days so the 00 UTC cyears = ['1981', '1982', '1983', '1984','1985', '1986', '1987', '1988', '1989', '1990', \ '1991', '1992', '1993', '1994', '1995', '1996', '1997', '1998', '1999', '2000', \ '2001', '2002', '2003', '2004', '2005', '2006', '2007', '2008', '2009', '2010', '2011'] nyears = len(cyears) #for cmonth in ['01','02','03','04','05','06','07','08','09','10','11']: for cmonth in ['03']: for clead in ['1.5']: # ---- ECMWF initial dates line up with the 12 UTC HUC dates. ifhour = int(float(clead)*24) print ('forecast lead in hours: ', ifhour) imonth = int(cmonth) if imonth == 1 or imonth == 3 or imonth == 5 or imonth == 7 \ or imonth == 8 or imonth == 10 or imonth == 12: ndays_total = 31*nyears elif imonth == 4 or imonth == 6 or imonth == 9 or imonth == 11: ndays_total = 30*nyears elif imonth == 2: ndays_total = 7*29 + (nyears-7)*28 # --- read in the HUC data provided by Matt Switanek now = datetime.now() current_time = now.strftime("%H:%M:%S") print ('Reading HUC data. 
        f1 = open("PRISM_pr_hucs_19810101-20180930.pickle", 'rb')
        precipHUCs = cPickle.load(f1)    # daily precip accumulations in mm (days, hucs)
        ndayhucs, nhucs = np.shape(precipHUCs)
        precipDates = cPickle.load(f1)   # the dates, e.g., [2012 3 19]
        hucLats = cPickle.load(f1)       # centroid lats of hucs
        hucLons = cPickle.load(f1)       # centroid lons of hucs (negative for west)
        hucShapes = cPickle.load(f1)     # embedded lists of huc boundaries
        hucDivision4 = cPickle.load(f1)  # the division-4 numeric codes of the hucs
        f1.close()

        # ---- convert the precipitation dates into yyyymmddhh format
        now = datetime.now()
        current_time = now.strftime("%H:%M:%S")
        print('Converting HUC precipitation dates. Current time is', current_time)
        yyyymmddhh_hucs = convert_to_yyyymmddhh(precipDates)

        # ---- for the chosen month, load the ERA-5 analysis data over the
        #      multiple years
        temp_store = np.zeros((ndays_total, 4, 45, 180), dtype=np.float64)   # 4 levels, 45 lats, 180 lons
        shum_store = np.zeros((ndays_total, 4, 45, 180), dtype=np.float64)
        uwnd_store = np.zeros((ndays_total, 4, 45, 180), dtype=np.float64)
        vwnd_store = np.zeros((ndays_total, 4, 45, 180), dtype=np.float64)
        yyyymmddhh_store = np.zeros((ndays_total), dtype=np.int32)
        ktr = 0
        for iyear, cyear in enumerate(cyears):
            yyyymmddhh, temp, ntimes, nlevels, ny, nx, lon, lat = load_era5(cyear, cmonth, 'air')
            yyyymmddhh, shum, ntimes, nlevels, ny, nx, lon, lat = load_era5(cyear, cmonth, 'shum')
            yyyymmddhh, uwnd, ntimes, nlevels, ny, nx, lon, lat = load_era5(cyear, cmonth, 'uwnd')
            yyyymmddhh, vwnd, ntimes, nlevels, ny, nx, lon, lat = load_era5(cyear, cmonth, 'vwnd')
            shum = np.where(shum < 0.0, 0.0, shum)   # clip unphysical negative humidities
            temp_store[ktr:ktr+ntimes,:,:,:] = temp[:,:,0:45,:]
            shum_store[ktr:ktr+ntimes,:,:,:] = shum[:,:,0:45,:]**0.3333   # cube root tames skewness
            uwnd_store[ktr:ktr+ntimes,:,:,:] = uwnd[:,:,0:45,:]
            vwnd_store[ktr:ktr+ntimes,:,:,:] = vwnd[:,:,0:45,:]
            yyyymmddhh_store[ktr:ktr+ntimes] = yyyymmddhh[:]
            ktr = ktr + ntimes

        # ---- determine the mean and standard deviation across all time
        #      samples, and standardize each field by these.
        now = datetime.now()
        current_time = now.strftime("%H:%M:%S")
        print('Determining standard deviation and dividing by this. Current time =', current_time)
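        # The per-date loop below standardizes each field in place; an
        # equivalent vectorized form (a sketch, assuming ktr == ndays_total
        # so every row of the arrays is populated) would broadcast over the
        # leading time axis, e.g.:
        #   temp_store = (temp_store - temp_store.mean(axis=0)) / temp_store.std(axis=0)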
        temp_stddev = np.std(temp_store, axis=0)
        shum_stddev = np.std(shum_store, axis=0)
        uwnd_stddev = np.std(uwnd_store, axis=0)
        vwnd_stddev = np.std(vwnd_store, axis=0)
        temp_mean = np.mean(temp_store, axis=0)
        shum_mean = np.mean(shum_store, axis=0)
        uwnd_mean = np.mean(uwnd_store, axis=0)
        vwnd_mean = np.mean(vwnd_store, axis=0)
        print('max, min temp_mean = ', np.max(temp_mean), np.min(temp_mean))
        print('max, min temp_stddev = ', np.max(temp_stddev), np.min(temp_stddev))
        for idate in range(ktr):
            temp_store[idate,:,:,:] = (temp_store[idate,:,:,:] - temp_mean[:,:,:]) / temp_stddev[:,:,:]
            shum_store[idate,:,:,:] = (shum_store[idate,:,:,:] - shum_mean[:,:,:]) / shum_stddev[:,:,:]
            uwnd_store[idate,:,:,:] = (uwnd_store[idate,:,:,:] - uwnd_mean[:,:,:]) / uwnd_stddev[:,:,:]
            vwnd_store[idate,:,:,:] = (vwnd_store[idate,:,:,:] - vwnd_mean[:,:,:]) / vwnd_stddev[:,:,:]
        print('max, min temp_store after normalization = ', np.max(temp_store), np.min(temp_store))
        print('max, min shum_store after normalization = ', np.max(shum_store), np.min(shum_store))
        print('max, min uwnd_store after normalization = ', np.max(uwnd_store), np.min(uwnd_store))
        print('max, min vwnd_store after normalization = ', np.max(vwnd_store), np.min(vwnd_store))

        # ---- rearrange the data: first index is time, second index is a 1-d
        #      composite vector of the standardized temperature, specific
        #      humidity, and u- and v-wind components.
        now = datetime.now()
        current_time = now.strftime("%H:%M:%S")
        print('Munging data into X, Y arrays. Current time =', current_time)
        ngrid = nx*45*4   # grid points per field: 4 levels x 45 lats x nx lons
        X = np.zeros((ktr, ngrid*4), dtype=np.float64)   # 4, as there are four fields
        Y = np.zeros((ktr, nhucs), dtype=np.float64)
        print('ktr = ', ktr)
        for idate in range(ktr):
            temp_1d = np.reshape(temp_store[idate,:,:,:], ngrid)
            shum_1d = np.reshape(shum_store[idate,:,:,:], ngrid)
            uwnd_1d = np.reshape(uwnd_store[idate,:,:,:], ngrid)
            vwnd_1d = np.reshape(vwnd_store[idate,:,:,:], ngrid)
            X[idate,0:ngrid] = temp_1d[:]
            X[idate,ngrid:2*ngrid] = shum_1d[:]
            X[idate,2*ngrid:3*ngrid] = uwnd_1d[:]
            X[idate,3*ngrid:4*ngrid] = vwnd_1d[:]

            # ---- for an n.5-day forecast, pluck the HUC data offset by
            #      +n.5 days; this must be found individually for the given
            #      month of each year.  yyyymmddhh_store holds the
            #      initial-condition dates; yyyymmddhh_hucs holds the HUC
            #      verification (accumulation-end) dates.
            fcst_date = int(dateshift(str(yyyymmddhh_store[idate]), ifhour))
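            # Note: list.index() below raises ValueError if the shifted date
            # is absent from the HUC record; a defensive variant (a sketch)
            # could skip such dates, e.g.:
            #   if fcst_date not in yyyymmddhh_hucs: continue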
            timeindex = yyyymmddhh_hucs.index(fcst_date)
            Y[idate,:] = precipHUCs[timeindex,:]**0.3333   # cube-root power transform

        # ---- standardize the Y (HUC precipitation) data
        print('max, min X = ', np.max(X), np.min(X))
        print('max, min Y after power transform = ', np.max(Y), np.min(Y))
        Ystd = np.std(Y, axis=0)
        Ymean = np.mean(Y, axis=0)
        for idate in range(ktr):
            Y[idate,:] = (Y[idate,:] - Ymean[:]) / Ystd[:]
        print('max, min Y after normalization = ', np.max(Y), np.min(Y))

        # ---- perform the cross-decomposition (PLS regression here, in
        #      place of a classical CCA)
        now = datetime.now()
        current_time = now.strftime("%H:%M:%S")
        print("Performing CCA. Current time = ", current_time)
        plsr = PLSRegression(n_components=160, scale=False)
        plsr.fit(X, Y)
        Ypred = plsr.predict(X)
        print('max, min Y = ', np.max(Y), np.min(Y))
        print('max, min Ypred = ', np.max(Ypred), np.min(Ypred))
        for i in range(ktr):
            print(i, Ypred[i,163], Y[i,163])   # spot check one sample HUC
        X_c, Y_c = plsr.transform(X, Y)
        print('shape X_c, Y_c = ', np.shape(X_c), np.shape(Y_c))
        now = datetime.now()
        current_time = now.strftime("%H:%M:%S")
        print("Done! Current time = ", current_time)

        # ---- save to file
        outfile = 'cca/cca_data_month='+cmonth+'_lead='+clead+'days.cPick'
        print('writing to ', outfile)
        ouf = open(outfile, 'wb')
        cPickle.dump(Ystd, ouf)
        cPickle.dump(Ymean, ouf)
        cPickle.dump(X_c, ouf)
        cPickle.dump(Y_c, ouf)
        cPickle.dump(Ypred, ouf)
        cPickle.dump(yyyymmddhh_store, ouf)
        ouf.close()
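
# For downstream use, a minimal sketch of reading one output file back and
# undoing the standardization and (approximately) the **0.3333 power
# transform applied to the HUC precipitation.  reload_cca_output is a
# hypothetical helper, not part of the workflow above; it only assumes the
# pickle layout written by this script.
def reload_cca_output(outfile):
    inf = open(outfile, 'rb')
    Ystd = cPickle.load(inf)
    Ymean = cPickle.load(inf)
    X_c = cPickle.load(inf)
    Y_c = cPickle.load(inf)
    Ypred = cPickle.load(inf)
    yyyymmddhh_store = cPickle.load(inf)
    inf.close()
    precip_pred = (Ypred*Ystd + Ymean)**3   # de-standardize, then cube back to ~mm
    return precip_pred, X_c, Y_c, yyyymmddhh_store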