def CDF_fitting_ccpa_precip_spline(cmonth, cend_hour):

    """ CDF_fitting_ccpa_precip_spline.py cmonth cend_hour

    This python script is designed to spline-fit an empirical CDF of
    precipitation.  The script is tailored to CCPA precipitation analyses
    over one of the National Digital Forecast Database domains for the
    National Blend of Models.

    Designed by Tom Hamill, NOAA, with help from Michael Scheuerer, Jan 2021.
    """

    import os, sys
    from datetime import datetime
    import numpy as np
    import _pickle as cPickle
    from netCDF4 import Dataset
    import scipy.stats as stats
    from scipy.interpolate import splrep, splev

    # =====================================================================

    def set_domain_boundaries(cdomain):
        """ used grib file of 2.5-km blend output grid to determine bounding
            lat and lon, and from that, the domain bounding indices for the
            0.25-degree GEFSv12 reforecast data that will encompass the domain.
        """
        if cdomain == 'conus':
            jmin = 93
            jmax = 246
            imin = 368
            imax = 686
        elif cdomain == 'pr':
            jmin = 243
            jmax = 256
            imin = 649
            imax = 667
        elif cdomain == 'ak':
            jmin = 19
            jmax = 161
            imin = 201
            imax = 967
        else:
            print ('invalid domain.  Exiting.')
            sys.exit()

        return jmin, jmax, imin, imax

    # =====================================================================

    def find_nearest(vec, value):
        """ given a vector vec and a particular value, find the index in
            vec that is nearest to value """
        idx = np.abs(vec-value).argmin()
        return idx

    # =====================================================================

    def fraczero_possamps(nsamps, precip_ens):
        """
        from the vector input sample precip_ens, define the fraction of
        samples with zero precipitation.  For the positive samples, add
        a small random number to deal with the fact that the data was
        discretized to ~0.1 mm, so that when later creating CDFs we don't
        have empirical values with lots of tied amounts.  Also, sort the
        nonzero amounts and return.
        """
        precip_ens_nonzero = np.delete(precip_ens, \
            np.where(precip_ens <= 0.0)) # censor at 0.0 mm
        precip_ens_nonzero = precip_ens_nonzero + \
            np.random.uniform(low=-0.01, high=0.01, size=len(precip_ens_nonzero))
        precip_ens_nonzero = np.delete(precip_ens_nonzero, \
            np.where(precip_ens_nonzero <= 0.0)) # censor at 0.0 mm
        nz = len(precip_ens_nonzero)
        #precip_ens_nonzero = precip_ens_nonzero + \
        #    np.random.uniform(low=-0.005, high=0.005, size=nz)
        precip_ens_nonzero = np.sort(precip_ens_nonzero)
        ntotal = len(precip_ens)
        nzero = ntotal - len(precip_ens_nonzero)
        fraction_zero = float(nzero) / float(ntotal)
        return fraction_zero, precip_ens_nonzero, nz
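
    # ---- illustrative behavior of fraczero_possamps (a sketch; the exact
    #      nonzero values vary because of the small random jitter above):
    #      fraczero_possamps(5, np.asarray([0.0, 0.0, 0.3, 1.2, 0.0]))
    #      returns fraction_zero = 0.6, the two sorted (jittered) nonzero
    #      samples, and nz = 2.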
    # =====================================================================

    # ---- inputs from command line

    #cmonth = sys.argv[1] # '01', '02' etc.
    #cend_hour = sys.argv[2] # 06, 12, 18, 00 -- end hour of 6-h period
    print ('cmonth, cend_hour = ', cmonth, cend_hour)
    imonth = int(cmonth) - 1
    nstride = 1 # do every point
    cdomain = 'conus'

    # ---- set directories, constants

    pflag = True # for print statements
    #master_directory = '/Volumes/Backup Plus/ccpa/'
    master_directory = '/Volumes/NBM/'+cdomain+'_panal/'
    master_directory_out = '/Volumes/NBM/'+cdomain+'_panal/CDF_spline/'
    ndaysomo = [31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31]
    ndaysomo_leap = [31, 29, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31] # leap-year Feb has 29 days
    cmonths_early = ['12','01','02','03','04','05','06','07','08','09','10','11']
    cmonths_late = ['02','03','04','05','06','07','08','09','10','11','12','01']

    # ---- determine the overall number of daily precipitation samples
    #      across all years (2002-2019, i.e., 18 years, 4 of them leap
    #      years) for this month and the surrounding two months.

    iearly = int(cmonths_early[imonth])-1
    ilate = int(cmonths_late[imonth])-1
    if imonth != 1: # not Feb
        nsamps_mid = ndaysomo[imonth]*18
    else:
        nsamps_mid = 4*ndaysomo_leap[imonth] + 14*ndaysomo[imonth]
    if iearly != 1: # not Feb
        nsamps_early = ndaysomo[iearly]*18
    else:
        nsamps_early = 4*ndaysomo_leap[iearly] + 14*ndaysomo[iearly]
    if ilate != 1: # not Feb
        nsamps_late = ndaysomo[ilate]*18
    else:
        nsamps_late = 4*ndaysomo_leap[ilate] + 14*ndaysomo[ilate]
    nsamps = nsamps_mid + nsamps_early + nsamps_late
    print ('nsamps = ', nsamps)

    # ---- read in the previously generated netCDF file with precipitation
    #      for this month and lead time as well as the surrounding two
    #      months.  All dates for this month have been smushed into one
    #      leading index, dimension nsamps, since the date of the forecast
    #      within the month and the member number is irrelevant for the
    #      distribution fitting.

    ktr = 0
    for iyear in range(2002,2020):

        # --- loop over the month in question and the surrounding 2 months

        for cmo in [cmonth, cmonths_early[imonth], cmonths_late[imonth]]:
            imo = int(cmo)-1
            if iyear%4 == 0:
                ndays = ndaysomo_leap[imo]
            else:
                ndays = ndaysomo[imo]
            cyear = str(iyear)
            infile = master_directory + cyear + cmo + \
                '_ccpa_on_ndfd_grid_6hourly.nc'
            print (infile)
            nc = Dataset(infile)
            yyyymmddhh_end = nc.variables['yyyymmddhh_end'][:]
            for iday in range(1,ndays+1):
                if iday < 10:
                    cday = '0'+str(iday)
                else:
                    cday = str(iday)
                iyyyymmddhh = int(str(iyear)+cmo+cday+cend_hour)
                idx = np.where(yyyymmddhh_end == iyyyymmddhh)[0]
                precip_in = np.squeeze(nc.variables['apcp_anal'][idx,:,:])
                if iyear == 2002 and iday == 1 and cmo == cmonth:
                    nlats_ndfd, nlons_ndfd = np.shape(precip_in)
                    precip_tseries = np.zeros((nsamps,nlats_ndfd,nlons_ndfd), \
                        dtype=np.float64)
                    missingv = -99.99*np.ones((nlats_ndfd, nlons_ndfd), \
                        dtype=np.float64)
                    lons = nc.variables['lons'][:,:]
                    lats = nc.variables['lats'][:,:]
                precip_in = np.where(precip_in < 500., precip_in, missingv)
                precip_tseries[ktr,:,:] = precip_in[:,:]
                ktr = ktr+1
            nc.close()

    # ---- loop over the grid points and estimate the spline coefficients

    now = datetime.now()
    begin_time = now.strftime("%H:%M:%S")
    spline_info = np.zeros((17,2,nlats_ndfd,nlons_ndfd), dtype=np.float64)
    spline_info_inv = np.zeros((17,2,nlats_ndfd,nlons_ndfd), dtype=np.float64)
    usegamma = np.zeros((nlats_ndfd,nlons_ndfd), dtype=np.int32) # flag for whether to use Gamma fit (1)
    indices_to_query = np.zeros((nlats_ndfd,nlons_ndfd,9), dtype=np.float16)
    Dnstat = 0.10*np.ones((nlats_ndfd,nlons_ndfd), dtype=np.float64)
    fzero = np.zeros((nlats_ndfd,nlons_ndfd), dtype=np.float64)
    cdf_at_indices = np.asarray([0.1, 0.25, 0.33333, 0.5, 0.65, \
        0.8, 0.85, 0.9, 0.95]) # CDF levels of the interior knots at which
                               # to evaluate the cumulative hazard function
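
    # ---- a minimal sketch (defined here but not called below) of how a
    #      fitted cumulative-hazard spline is converted back to a CDF,
    #      i.e., F(p) = 1 - exp(-H(p)).  The helper name is illustrative
    #      only, not part of the operational code.

    def cdf_from_hazard_spline(precip_amounts, spline_tuple):
        """ evaluate the fitted cumulative hazard spline (a splrep tck
            tuple) at the given precipitation amounts and convert the
            result to CDF values """
        hazard = splev(precip_amounts, spline_tuple)
        return 1.0 - np.exp(-hazard)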

    print ('******** COMPUTING SPLINE COEFFICIENTS (wetter) or GAMMA PARAMETERS (dry) *********')

    for jy in range(0,nlats_ndfd,nstride):
    #for jy in range(498,499):   # single grid point, for debugging
        now = datetime.now()
        current_time = now.strftime("%H:%M:%S")
        if jy%10 == 0: print (' ***** begin time, current time, jy, nlats_ndfd, lat = ',\
            begin_time, current_time, jy, nlats_ndfd, lats[jy,0])
        for ix in range(0,nlons_ndfd,nstride):
        #for ix in range(521,522):   # single grid point, for debugging

            # ---- there is a grib round-off error that can give negative
            #      values slightly smaller than teeny precip.  to make sure
            #      that we don't have either negative values or lots of the
            #      same tiny values, subtract off teeny_precip

            if pflag == True: print ('******* ',jy,ix)
            precip_ens_1d = precip_tseries[:,jy,ix]
            tp = np.min( [np.min(precip_ens_1d), 0.0] ) # <= 0.0
            teeny_precip = tp*np.ones(nsamps)
            precip_ens_1d = precip_ens_1d - teeny_precip[:] # make sure that samples >= 0.0
            if pflag == True: print ('min, max precip_ens_1d = ',\
                np.min(precip_ens_1d), np.max(precip_ens_1d))

            # ---- take this grid point's sample, calc fraction_zero, and return
            #      the number of nonzero (nz) samples and their sorted values

            fraction_zero, precip_ens_nonzero, nz = \
                fraczero_possamps(nsamps, precip_ens_1d) # return sorted
            if pflag == True and nz > 0: print (' precip_ens_nonzero[0:-1] = ',\
                precip_ens_nonzero[0:-1])
            if pflag == True: print (' nz = ',nz)

            if nz > 40:

                # ---- spline fit the CDF to the precipitation values via
                #      Michael Scheuerer's hazard function (see Fig 3 in
                #      https://doi.org/10.1175/MWR-D-20-0096.1 )

                fzero[jy,ix] = fraction_zero
                empirical_cdf = 1.0/(2.0*nz) + np.arange(nz)/nz

                # ---- set where spline knots are

                query_these_indices = [ nz//10, nz//4, nz//3, nz//2, (3*nz)//5, \
                    (4*nz)//5, (17*nz)//20, (9*nz)//10, (19*nz)//20 ]
                indices_to_query[jy,ix,:] = query_these_indices[:]
                empirical_precipvals = precip_ens_nonzero[query_these_indices]
                hazard_function_empirical = -np.log(1.0-empirical_cdf)

                # ---- compute spline coefficients

                spltemp = splrep(precip_ens_nonzero, hazard_function_empirical, \
                    xb=0., task=-1, t = empirical_precipvals)

                # ---- compute the fitted CDF

                spline_hazard = splev(precip_ens_nonzero, spltemp)
                spline_cdf = 1.0 - np.exp(-spline_hazard)

                # ---- save spline information to numpy array

                spline_info[:,0,jy,ix] = spltemp[0]
                spline_info[:,1,jy,ix] = spltemp[1]

                # ---- in the subsequent quantile mapping, we will want the
                #      analyzed precipitation amount given the quantile.
                #      Accordingly, let's also reverse the data in the spline
                #      and get the spline fits of precipitation amount (y)
                #      to the cumulative hazard function (x), with interior
                #      knots at the hazard values of the chosen CDF levels.

                hazard_function_at_indices = -np.log(1.0-cdf_at_indices)
                spltemp_inv = splrep(hazard_function_empirical, precip_ens_nonzero, \
                    xb=0., task=-1, t = hazard_function_at_indices)
                spline_info_inv[:,0,jy,ix] = spltemp_inv[0]
                spline_info_inv[:,1,jy,ix] = spltemp_inv[1]

                # --- evaluate Dn statistic, goodness of fit

                diff = np.abs(empirical_cdf - spline_cdf)
                Dnstat[jy,ix] = np.max(diff)
                usegamma[jy,ix] = 0

            elif nz < 10:

                # --- over the whole training sample, there were not enough
                #     nonzero precip values recorded.  Flag this as missing data.

                spline_info_inv[:,0,jy,ix] = -99.99
                spline_info_inv[:,1,jy,ix] = -99.99
                Dnstat[jy,ix] = -99.99
                fzero[jy,ix] = -99.99
                indices_to_query[jy,ix,:] = -99
                usegamma[jy,ix] = 0

            else:

                # ---- too few samples to use splines; fit a Gamma
                #      distribution using the Thom (1958) estimator
                #      described in the Wilks textbook, Statistical
                #      Methods in the Atmospheric Sciences.
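                #      In brief (this is what the code below computes):
                #        D = ln(xbar) - mean(ln(x_i)), with xbar the mean of
                #            the nonzero samples,
                #        alpha_hat = (1 + sqrt(1 + 4D/3)) / (4D)
                #        beta_hat  = xbar / alpha_hat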

                empirical_cdf = 1.0/(2.0*nz) + np.arange(nz)/nz
                fzero[jy,ix] = fraction_zero
                pmean = np.mean(precip_ens_nonzero)
                lnxbar = np.log(pmean)
                if pflag == True: print ('pmean, lnxbar = ', pmean, lnxbar)
                meanlnxi = np.mean(np.log(precip_ens_nonzero))
                if pflag == True: print ('meanlnxi = ', meanlnxi)
                D = lnxbar - meanlnxi
                alpha_hat = (1.0 + np.sqrt(1.0+4.0*D/3.0)) / (4.0*D)
                beta_hat = pmean / alpha_hat
                if pflag == True: print ('alpha_hat, beta_hat = ', alpha_hat, beta_hat)
                indices_to_query[jy,ix,:] = -1 # flag for using Gamma
                spline_info_inv[:,0,jy,ix] = alpha_hat # smoosh into the spline array
                spline_info_inv[:,1,jy,ix] = beta_hat # smoosh into the spline array
                usegamma[jy,ix] = 1

                # --- evaluate Dn statistic, goodness of fit

                y0 = precip_ens_nonzero / beta_hat
                fitted_CDF = stats.gamma.cdf(y0, alpha_hat)
                diff = np.abs(empirical_cdf - fitted_CDF)
                Dnstat[jy,ix] = np.max(diff)

    #sys.exit()   # uncomment to stop here, before any output is written, when debugging

    # --- save goodness-of-fit statistics and lats/lons to cPickle file

    #outfile = master_directory_out + cmonth+'_'+cdomain+\
    #    '_CCPA_spline_info_h' + cend_hour + '.cPick'
    #print ('writing to ', outfile)
    #ouf = open(outfile, 'wb')
    #cPickle.dump(spline_info, ouf)
    #cPickle.dump(spline_info_inv, ouf)
    #cPickle.dump(fzero, ouf)
    #cPickle.dump(usegamma, ouf)
    #cPickle.dump(indices_to_query, ouf)
    #ouf.close()

    outfile = master_directory_out + cmonth+'_'+cdomain+\
        '_CCPA_Dnstat_h' + cend_hour + '.cPick'
    print ('writing to ', outfile)
    ouf = open(outfile, 'wb')
    cPickle.dump(Dnstat, ouf)
    cPickle.dump(lons, ouf)
    cPickle.dump(lats, ouf)
    ouf.close()

    # ---- save spline / Gamma fit information to netCDF file

    outfile = master_directory_out + cmonth+'_'+cdomain+\
        '_CCPA_spline_info_h' + cend_hour + 'UTC.nc'
    print ('writing to ',outfile)
    ncout = Dataset(outfile,'w',format='NETCDF4_CLASSIC')

    xf = ncout.createDimension('xf',nlons_ndfd)
    xvf = ncout.createVariable('xf','i4',('xf',))
    xvf.long_name = "eastward grid point number on NDFD grid"
    xvf.units = "n/a"

    yf = ncout.createDimension('yf',nlats_ndfd)
    yvf = ncout.createVariable('yf','i4',('yf',))
    yvf.long_name = "northward grid point number on NDFD grid"
    yvf.units = "n/a"

    xspd = ncout.createDimension('xspd',17)
    xspdf = ncout.createVariable('xspd','i4',('xspd',))
    xspdf.long_name = "index for spline dimension"
    xspdf.units = "n/a"

    x2 = ncout.createDimension('x2',2)
    x2f = ncout.createVariable('x2','i4',('x2',))
    x2f.long_name = "2nd index for spline dimension"
    x2f.units = "n/a"

    lonsa = ncout.createVariable('lons','f4',('yf','xf',))
    lonsa.long_name = "longitude"
    lonsa.units = "degrees_east"

    latsa = ncout.createVariable('lats','f4',('yf','xf',))
    latsa.long_name = "latitude"
    latsa.units = "degrees_north"

    spline_info_inv_out = ncout.createVariable('spline_info_inv', \
        'f4', ('xspd','x2','yf','xf',), zlib=True, least_significant_digit=6)
    spline_info_inv_out.units = "n/a"
    spline_info_inv_out.long_name = \
        "Information for computing precipitation from "+\
        "spline inverse (or Gamma CDF for dry points)"
    spline_info_inv_out.missing_value = \
        np.array(-9999.99,dtype=np.float32)

    fzero_out = ncout.createVariable('fzero','f4',('yf','xf',), \
        zlib=True, least_significant_digit=6)
    fzero_out.units = "n/a"
    fzero_out.long_name = "fraction_zero"
    fzero_out.missing_value = np.array(-9999.99,dtype=np.float32)

    usegamma_out = ncout.createVariable('usegamma', \
        'i4',('yf','xf',), zlib=True)
    usegamma_out.units = "n/a"
    usegamma_out.long_name = "1 if fit CDF via Gamma distribution, 0 if not"
    usegamma_out.missing_value = np.array(-99,dtype=np.int32)

    # ---- metadata

    ncout.title = "NDFD domain spline inverse coefficients / Gamma parameters"
    ncout.history = "from CDF fitting code by Tom Hamill, PSL"
    ncout.institution = "psl.noaa.gov"
    ncout.platform = "n/a"
    ncout.references = "n/a"
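
    # ---- note on the stored contents: at spline-fit grid points,
    #      spline_info_inv[:,0,:,:] holds the knot vector and
    #      spline_info_inv[:,1,:,:] the coefficients of the cubic spline
    #      mapping cumulative hazard -ln(1-CDF) to precipitation amount;
    #      at Gamma-fit grid points (usegamma = 1) those two slices
    #      instead hold the replicated alpha_hat and beta_hat.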

    # ---- copy the outputs to netCDF structures

    xvf[:] = range(nlons_ndfd)
    yvf[:] = range(nlats_ndfd)
    xspdf[:] = range(17)
    x2f[:] = range(2)
    lonsa[:] = lons[:,:]
    latsa[:] = lats[:,:]
    spline_info_inv_out[:] = spline_info_inv[:,:,:,:]
    fzero_out[:] = fzero[:,:]
    usegamma_out[:] = usegamma[:,:]

    ncout.close()

    istat = 0
    return istat
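
# =====================================================================

# ---- a minimal command-line wrapper (a sketch; the argument order follows
#      the commented-out sys.argv parsing near the top of the function).

if __name__ == '__main__':

    import sys
    cmonth = sys.argv[1]    # '01', '02', etc.
    cend_hour = sys.argv[2] # '00', '06', '12', or '18' -- end hour of the 6-h period
    istat = CDF_fitting_ccpa_precip_spline(cmonth, cend_hour)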