def CDF_fitting_ccpa_precip_spline_v2(cmonth, cend_hour): """ CDF_fitting_ccpa_precip_spline.py cmonth cend_hour where cmonth = '01' to '12' and cend_hour is '00','06','12', or '18' this python script is designed to spline fit an empirical CDF of 6-h accumulated precipitation. The script is tailored to merged CCPA/MSWEP precipitation analyses over the National Digital Forecast Database CONUS domain for the National Blend of Models Designed by Tom Hamill, NOAA, with help from Michael Scheuerer, Jan 2021 """ import os, sys from datetime import datetime import numpy as np import _pickle as cPickle from netCDF4 import Dataset import _pickle as cPickle import scipy.stats as stats import scipy.stats as stats from scipy.interpolate import splrep, splev # ===================================================================== def fraczero_possamps(nsamps, precip_samples): """ from the vector input sample precip_samples, define the fraction of samples with effectively zero precipitation. Add a small random number to deal with the fact that the data was discretized to ~0.1 mm, so that when later creating CDFs we don't have empirical values with lots of tied amounts. Also, sort the nonzero amounts and return. """ precip_samples_nonzero = np.delete(precip_samples, \ np.where(precip_samples <= 0.0)) precip_samples_nonzero = precip_samples_nonzero + \ np.random.uniform(low=-0.01,high=0.01,size=len(precip_samples_nonzero)) precip_samples_nonzero = np.delete(precip_samples_nonzero, \ np.where(precip_samples_nonzero <= 0.0)) # censor at 0.0 mm nz = len(precip_samples_nonzero) # number non-zero precip_samples_nonzero = np.sort(precip_samples_nonzero) ntotal = len(precip_samples) nzero = ntotal - len(precip_samples_nonzero) fraction_zero = float(nzero) / float(ntotal) return fraction_zero, precip_samples_nonzero, nz # ===================================================================== # ---- set directories, constants print ('cmonth, cend_hour = ', cmonth, cend_hour) imonth = int(cmonth) - 1 nstride = 1 # do every point cdomain = 'conus' pflag = False # True # for print statements #master_directory = '/Volumes/Backup Plus/ccpa/' master_directory = '/Volumes/NBM/'+cdomain+'_panal/' master_directory_out = '/Volumes/NBM/'+cdomain+'_panal/CDF_spline/' ndaysomo = [31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31] ndaysomo_leap = [31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31] cmonths_early = ['12','01','02','03','04','05','06','07','08','09','10','11'] cmonths_late = ['02','03','04','05','06','07','08','09','10','11','12','01'] cmonths = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', \ 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'] yearstart = 2002 # CCPA only available starting 2002 yearend = 2020 # companion reforecasts end at end of 2019 # ---- determine the overall number of daily precipitation # samples across all years for this month and the surrounding # two months iearly = int(cmonths_early[imonth])-1 ilate = int(cmonths_late[imonth])-1 if imonth != 1: # not Feb nsamps_mid = ndaysomo[imonth]*18 else: nsamps_mid = 4*ndaysomo_leap[imonth] + 14*ndaysomo[imonth] if iearly != 1: # not Feb nsamps_early = ndaysomo[iearly]*20 else: nsamps_early = 4*ndaysomo_leap[iearly] + 14*ndaysomo[iearly] if ilate != 1: # not Feb nsamps_late = ndaysomo[ilate]*20 else: nsamps_late = 4*ndaysomo_leap[ilate] + 14*ndaysomo[ilate] nsamps = nsamps_mid + nsamps_early + nsamps_late print ('nsamps = ', nsamps) # ---- read in the previously generated netCDF file with precipitation # for this month and lead time as well as the surrounding # two months. All dates for this month have # been smushed into one leading index, dimension nsamps, # since the date of the forecast within the month and # the member number is irrelevant for the distribution # fitting. ktr = 0 for iyear in range(yearstart, yearend): # --- loop over the month in question and the surrounding 2 months for cmo in [cmonth, cmonths_early[imonth], cmonths_late[imonth]]: imo = int(cmo)-1 if iyear%4 == 0: ndays = ndaysomo_leap[imo] else: ndays = ndaysomo[imo] cyear = str(iyear) infile = master_directory + cyear + cmo + \ '_ccpa_on_ndfd_grid_6hourly.nc' print (infile) nc = Dataset(infile) yyyymmddhh_end = nc.variables['yyyymmddhh_end'][:] for iday in range(1,ndays+1): if iday < 10: cday = '0'+str(iday) else: cday = str(iday) iyyyymmddhh = int(str(iyear)+cmo+cday+cend_hour) idx = np.where(yyyymmddhh_end == iyyyymmddhh)[0] precip_in = np.squeeze(nc.variables['apcp_anal'][idx,:,:]) if iyear == 2002 and iday == 1 and cmo == cmonth: nlats_ndfd, nlons_ndfd = np.shape(precip_in) precip_tseries = np.zeros((nsamps,nlats_ndfd,nlons_ndfd), \ dtype=np.float64) missingv = -99.99*np.ones((nlats_ndfd, nlons_ndfd), \ dtype=np.float64) lons = nc.variables['lons'][:,:] lats = nc.variables['lats'][:,:] precip_in = np.where(precip_in < 500., precip_in, missingv) precip_tseries[ktr,:,:] = precip_in[:,:] ktr = ktr+1 nc.close() #precip_samples_1d = precip_tseries[:,784,930] #fraction_zero, precip_samples_nonzero, nz = \ # fraczero_possamps(nsamps, precip_samples_1d) #print ('precip_samples_nonzero = ', precip_samples_nonzero[0:-1:10]) # ---- loop over the grid points and estimate the inverse spline coefficients now = datetime.now() begin_time = now.strftime("%H:%M:%S") spline_info = np.zeros((nlats_ndfd,nlons_ndfd,2,17), dtype=np.float64) spline_info_inv = np.zeros((nlats_ndfd,nlons_ndfd,2,17), dtype=np.float64) usegamma = np.zeros((nlats_ndfd,nlons_ndfd), dtype=np.int32) # flag for whether to use Gamma fit (1) or spline (0) or missing data (-1) fzero = np.zeros((nlats_ndfd, nlons_ndfd), dtype=np.float) #cdf_at_indices = np.asarray([ 0.1, 0.25, 0.33333, 0.5, 0.65, 0.8, 0.85, 0.93, 0.98]) # # these are interior knots where to calculate the cum hazard fn. cdf_at_indices = np.asarray([ 0.1, 0.25, 0.5, 0.6, 0.8, 0.85, 0.9, 0.96, 0.98]) # these are interior knots where to calculate the cum hazard fn. print ('******** COMPUTING SPLINE COEFFICIENTS (wetter) or GAMMA PARAMETERS (dry) *********') for jy in range(0, nlats_ndfd, nstride): #for jy in range(784,785): now = datetime.now() current_time = now.strftime("%H:%M:%S") if jy%10 == 0: print (' ***** begin time, current time, jy, nlats_ndfd, lat = ',\ begin_time, current_time, jy, nlats_ndfd, lats[jy,0]) for ix in range(0,nlons_ndfd, nstride): #for ix in range (930, 931): # ---- there is a grib round-off error that can give negative # values slightly smaller than teeny precip. to make sure # that we don't have either negative values or lots of the # same tiny values, subtractoff teeny_precip if jy == 784 and ix == 930: pflag = True else: pflag = False if pflag == True: print ('******* ',jy,ix) precip_samples_1d = precip_tseries[:,jy,ix] # take this grid point's sample, calc fraction_zero, and return # the number of nonzero (nz) samples and their sorted values fraction_zero, precip_samples_nonzero, nz = \ fraczero_possamps(nsamps, precip_samples_1d) # return sorted if pflag == True and nz > 0 : print (' precip_samples_nonzero[0:-1] = ',\ precip_samples_nonzero[0:-1]) if pflag == True: print (' nz = ',nz) if nz > 50: # ---- spline fit the CDF to the precipitation values via # Michael Scheuerer's hazard function (see Fig 3 in # https://doi.org/10.1175/MWR-D-20-0096.1. ) usegamma[jy,ix] = 0 # a flag to use spline inverse fzero[jy,ix] = fraction_zero empirical_cdf = 1.0/(2.0*nz) + np.arange(nz)/nz # ---- set where spline knots are. query_these_indices = [ nz//10, nz//4, nz//3, nz//2, (3*nz)//5, \ (4*nz)//5, (17*nz)//20, (9*nz)//10, (19*nz)//20] cdf_at_indices = np.asarray(\ [ 0.1, 0.25, 0.5, 0.6, 0.8, 0.85, 0.9, 0.96, 0.98]) #query_these_indices = [ nz//10, nz//4, nz//2, (3*nz)//5, \ # (4*nz)//5, (17*nz)//20, (9*nz)//10, (24*nz)//25, (49*nz)//50] if pflag == True: \ print (' query_these_indices = ',query_these_indices) empirical_precipvals = precip_samples_nonzero[query_these_indices] if pflag == True: \ print (' empirical_precipvals = ',empirical_precipvals) hazard_function_empirical = -np.log(1.0 - empirical_cdf) # ---- in the subsequent quantile mapping, we will want the # analyzed precipitation amount given the quantile. # Accordingly, let's reverse the data in the spline # and get the spline fits of precipitation amount # (y) to the cdf (x). hazard_function_at_indices = -np.log(1.0-cdf_at_indices) if pflag == True: print \ (' hazard_function_empirical = ',hazard_function_empirical) #spltemp_inv = splrep(hazard_function_empirical, precip_samples_nonzero, \ # xb=0., task=-1, t = cdf_at_indices) spltemp_inv = splrep(hazard_function_empirical, precip_samples_nonzero, \ xb=0., task=-1, t = hazard_function_at_indices) spline_info_inv[jy,ix,0,:] = spltemp_inv[0] if pflag == True: print \ (' spline_info_inv[jy,ix,0,:] = ',spline_info_inv[jy,ix,0,:]) spline_info_inv[jy,ix,1,:] = spltemp_inv[1] if pflag == True: print \ (' spline_info_inv[jy,ix,1,:] = ',spline_info_inv[jy,ix,1,:]) # ---- spline fit to anal CDF with the focus on knots at higher quantiles. # these data are saved not for quantile mapping but for diagnostics. spltemp = splrep(precip_samples_nonzero, hazard_function_empirical, \ xb=0., task=-1, t = empirical_precipvals) spline_info[jy,ix,0,:] = spltemp[0] spline_info[jy,ix,1,:] = spltemp[1] fitted_precip = splev(hazard_function_empirical, spltemp) if pflag == True: print ('fitted precip = ', fitted_precip) elif nz < 10: # --- over the whole training sample, there were not enough # nonzero precip values recorded. Flag this as missing data. usegamma[jy,ix] = -1 spline_info_inv[jy,ix,0,:] = -99.99 spline_info_inv[jy,ix,1,:] = -99.99 fzero[jy,ix] = -99.99 else: # ---- too few samples to use splines; fit a single-parameter Gamma # using Thom (1958) estimator described in Wilks textbook, # Statistical Methods in the Atmospheric Sciences. usegamma[jy,ix] = 1 empirical_cdf = 1.0/(2.0*nz) + np.arange(nz)/nz fzero[jy,ix] = fraction_zero pmean = np.mean(precip_samples_nonzero) lnxbar = np.log(pmean) if pflag == True: print ('pmean, lnxbar = ', pmean, lnxbar) meanlnxi = np.mean(np.log(precip_samples_nonzero)) if pflag == True: print ('meanlnxi = ', meanlnxi) D = lnxbar - meanlnxi alpha_hat = (1.0 + np.sqrt(1.0+4.0*D/3.0)) / (4.0*D) beta_hat = pmean / alpha_hat if pflag == True: print('alpha_hat, beta_hat = ', alpha_hat, beta_hat) spline_info_inv[jy,ix,0,:] = alpha_hat # smoosh into the spline array spline_info_inv[jy,ix,1,:] = beta_hat # smoosh into the spline array # ---- save to netCDF file outfile = master_directory_out + cmonths[imonth]+'_'+cdomain+\ '_CCPA_spline_info_h' + cend_hour + 'UTC.nc' print ('writing to ',outfile) ncout = Dataset(outfile,'w',format='NETCDF4_CLASSIC') xf = ncout.createDimension('xf',nlons_ndfd) xvf = ncout.createVariable('xf','i4',('xf',)) xvf.long_name = "eastward grid point number on NDFD grid" xvf.units = "n/a" yf = ncout.createDimension('yf',nlats_ndfd) yvf = ncout.createVariable('yf','i4',('yf',)) yvf.long_name = "northward grid point number on NDFD grid" yvf.units = "n/a" xspd = ncout.createDimension('xspd',17) xspdf = ncout.createVariable('xspd','i4',('xspd',)) xspdf.long_name = "index for spline dimension" xspdf.units = "n/a" x2 = ncout.createDimension('x2',2) x2f = ncout.createVariable('x2','i4',('x2',)) x2f.long_name = "2nd index for spline dimension" x2f.units = "n/a" lonsa = ncout.createVariable('lons','f4',('yf','xf',)) lonsa.long_name = "longitude" lonsa.units = "degrees_east" latsa = ncout.createVariable('lats','f4',('yf','xf',)) latsa.long_name = "latitude" latsa.units = "degrees_north" spline_info_inv_out = ncout.createVariable('spline_info_inv',\ 'f4',('yf','xf','x2','xspd'), zlib=True,least_significant_digit=6) spline_info_inv_out.units = "n/a" spline_info_inv_out.long_name = \ "Information for computing quantile-mapped precipitation from "+\ "spline inverse (or Gamma CDF for dry points). When given a "+\ "forecast quantile, this will predict the analyzed precipitation amt. "+\ "x2=0 is for knots, x2=1 for spline coefficients. Splines used "+\ "only if there are sufficient samples, > 40. If the sample size is "+\ "between 10 and 40, fit a Gamma distribution instead, and insert the "+\ "alpha and beta parameters into this variable, alpha in x2=0, "+\ "beta in x2=1. If less than 10 samples, don't try to do anything." spline_info_inv_out.missing_value = \ np.array(-9999.99,dtype=np.float32) spline_info_out = ncout.createVariable('spline_info',\ 'f4',('yf','xf','x2','xspd'), zlib=True,least_significant_digit=6) spline_info_out.units = "n/a" spline_info_out.long_name = \ "Information for computing precipitation from spline "+\ "Diagnostic, and only valid at points that are reasonably moist." spline_info_out.missing_value = \ np.array(-9999.99,dtype=np.float32) fzero_out = ncout.createVariable('fzero','f4',('yf','xf',), zlib=True,least_significant_digit=6) fzero_out.units = "n/a" fzero_out.long_name = "fraction_zero" fzero_out.missing_value = np.array(-9999.99,dtype=np.float32) usegamma_out = ncout.createVariable('usegamma',\ 'i4',('yf','xf',), zlib=True) usegamma_out.units = "n/a" usegamma_out.long_name = "1 if fit CDF via Gamma distribution, 0 if not" usegamma_out.missing_value = np.array(-99,dtype=np.int32) # ---- metadata ncout.title = "NDFD domain spline inverse coefficients / Gamma parameters " ncout.history = "from CDF fitting code by Tom Hamill, PSL" ncout.institution = "psl.noaa.gov" ncout.platform = "n/a" ncout.references = "n/a" # ---- copy the outputs to netCDF data structures. xvf[:] = range(nlons_ndfd) yvf[:] = range(nlats_ndfd) xspdf[:] = range(17) x2f[:] = range(2) lonsa[:] = lons[:,:] latsa[:] = lats[:,:] spline_info_inv_out[:] = spline_info_inv[:,:,:,:] spline_info_out[:] = spline_info[:,:,:,:] fzero_out[:] = fzero[:,:] usegamma_out[:] = usegamma[:,:] # ---- close the netCDF file ncout.close() istat = 0 return istat