def CDF_spline_fitting_forecast_precip_mean(cmonth, clead, cdomain):

    """
    CDF_spline_fitting_forecast_precip_mean.py cmonth clead cdomain

    Fit spline to CDF and save info using cumulative hazard function.

    Designed by Tom Hamill, NOAA, with help from Michael Scheuerer, Dec 2020

    """

    import os, sys
    from datetime import datetime
    import numpy as np
    import _pickle as cPickle
    from netCDF4 import Dataset
    import _pickle as cPickle
    import scipy.stats as stats
    from scipy.interpolate import LSQUnivariateSpline, splrep, splev
    import matplotlib.pyplot as plt
    from matplotlib import rcParams
    from mpl_toolkits.basemap import Basemap, interp
    from mpl_toolkits.axes_grid1 import make_axes_locatable

    # =====================================================================

    def set_domain_boundaries(cdomain):
    
        """ used grib file of 2.5-km blend output grid to determine bounding 
            lat and lon, and from that, the domain bounding indices for the 
            0.25 GEFSv12 reforecast data that will encompass the domain.    
        """
        if cdomain == 'conus': 
            jmin = 93
            jmax = 246
            imin = 368
            imax = 686
        elif cdomain == 'pr':
            jmin = 243
            jmax = 256
            imin = 649
            imax = 667   
        elif cdomain == 'ak':
            jmin = 19
            jmax = 161
            imin = 201
            imax = 967
        else:
            print ('invalid domain.  Exiting.')     
            sys.exit()    
 
        return jmin, jmax, imin, imax

    # =====================================================================

    def get_surrounding_months(cmonth):

        if cmonth == 'Jan':
            cmonth_early = 'Dec'
            cmonth_late = 'Feb'
        elif cmonth == 'Feb':
            cmonth_early = 'Jan'
            cmonth_late = 'Mar'
        elif cmonth == 'Mar':
            cmonth_early = 'Feb'
            cmonth_late = 'Apr'
        elif cmonth == 'Apr':
            cmonth_early = 'Mar'
            cmonth_late = 'May'
        elif cmonth == 'May':
            cmonth_early = 'Apr'
            cmonth_late = 'Jun'
        elif cmonth == 'Jun':
            cmonth_early = 'May'
            cmonth_late = 'Jul'
        elif cmonth == 'Jul':
            cmonth_early = 'Jun'
            cmonth_late = 'Aug'
        elif cmonth == 'Aug':
            cmonth_early = 'Jul'
            cmonth_late = 'Sep'
        elif cmonth == 'Sep':
            cmonth_early = 'Aug'
            cmonth_late = 'Oct'
        elif cmonth == 'Oct':
            cmonth_early = 'Sep'
            cmonth_late = 'Nov'
        elif cmonth == 'Nov':
            cmonth_early = 'Oct'
            cmonth_late = 'Dec'
        elif cmonth == 'Dec':
            cmonth_early = 'Nov'
            cmonth_late = 'Jan'
        else:
            print ('invalid month')
            sys.exit()
    
        return cmonth_early, cmonth_late

    # =====================================================================

    def fraczero_possamps(nsamps, precip_ens):
    
        """
    
        from the vector input sample precip_ens, define the fraction of
        samples with zero precipitation.   For the positive samples, add
        a small random number to deal with the fact that the data was 
        discretized to ~0.1 mm, so that when later creating CDFs we don't 
        have empirical values with lots of tied amounts.  Also, sort the 
        nonzero amounts and return.
    
        """
        number_zeros = 0
    
        # data discretized, so add random component of this magnitude
        #precip_ens = np.where(precip_ens < 4.0, precip_ens* \
        #    np.random.uniform(low=-0.5,high=1.5,size=len(precip_ens)), precip_ens)
    
        precip_ens_nonzero = np.delete(precip_ens, \
            np.where(precip_ens <= 0.0))  # censor at 0.0 mm
        precip_ens_nonzero = precip_ens_nonzero + \
            np.random.uniform(low=-0.1,high=0.1,size=len(precip_ens_nonzero))
        precip_ens_nonzero = np.delete(precip_ens_nonzero, \
            np.where(precip_ens_nonzero <= 0.0))  # censor at 0.0 mm
        nz = len(precip_ens_nonzero)    
        
        #precip_ens_nonzero = precip_ens_nonzero + \
        #    np.random.uniform(low=-0.005,high=0.005,size=nz) 
        precip_ens_nonzero = np.sort(precip_ens_nonzero)  
        ntotal = len(precip_ens)
        nzero = ntotal - len(precip_ens_nonzero)
        fraction_zero = float(nzero) / float(ntotal)
        return fraction_zero, precip_ens_nonzero, nz

    # =====================================================================
    
    # ---- inputs from command line

    #cmonth = sys.argv[1] # 'Jan', 'Feb', etc.
    #clead = sys.argv[2] # 03, 06, 12, etc.
    #cdomain = sys.argv[3] # conus, etc.
    cmonth_list = ['Jan','Feb','Mar','Apr','May','Jun','Jul','Aug','Sep','Oct','Nov','Dec']
    cmonthnum = ['01','02','03','03','04','05','06','07','08','09','10','11','12']
    print ('cmonth = ', cmonth)
    print (cmonth_list)
    imonth_index = cmonth_list.index(cmonth)

    # ---- set parameters

    jmin, jmax, imin, imax = set_domain_boundaries(cdomain)
    nstride = 1
    cmonth_early, cmonth_late = get_surrounding_months(cmonth)
    pflag = False # for print statements
    master_directory = '/Volumes/NBM/'+cdomain+'_gefsv12/precip/netcdf/'
    master_directory_out = '/Volumes/NBM/'+cdomain+'_gefsv12/CDF_spline/'
    nmembers = 5

    # ---- read in the previously generated netCDF file with precipitation
    #      for this month and lead time.  All members, dates for this 
    #      month have been smushed into one leading index, dimension
    #      nsamps, since the date of the forecast within the month and 
    #      the member number is irrelevant for the distribution fitting.
   
    ncfile = master_directory + cmonth + '_apcp_sfc_h' + clead + '.nc'
    print (ncfile)
    nc = Dataset(ncfile)
    precip_middle = nc.variables['apcp_fcst'][:,jmin:jmax,imin:imax]
    nsamps_middle, ny_gefsv12, nx_gefsv12 = np.shape(precip_middle)
    lons_1d = nc.variables['lons_fcst'][imin:imax]
    lats_1d = nc.variables['lats_fcst'][jmin:jmax]
    nc.close()

    ncfile = master_directory + cmonth_early + '_apcp_sfc_h' + clead + '.nc'
    print (ncfile)
    nc = Dataset(ncfile)
    precip_early = nc.variables['apcp_fcst'][:,jmin:jmax,imin:imax]
    nsamps_early, ny_gefsv12, nx_gefsv12 = np.shape(precip_early)
    nc.close()

    ncfile = master_directory + cmonth_late + '_apcp_sfc_h' + clead + '.nc'
    print (ncfile)
    nc = Dataset(ncfile)
    precip_late = nc.variables['apcp_fcst'][:,jmin:jmax,imin:imax]
    nsamps_late, ny_gefsv12, nx_gefsv12 = np.shape(precip_late)
    nc.close()

    # total samples of all members. 
    nsamps = nsamps_middle + nsamps_early + nsamps_late
    nsamps_mean = nsamps//5
    
    
    # ---- the first dimension of precip arrays are cases x members.  
    #      average over all the members
    
    precip_middle_mean = np.zeros((nsamps_middle//5, ny_gefsv12, nx_gefsv12), dtype=np.float64)
    for i in range(0, nsamps_middle,5):
        precip_middle_mean[i//5,:,:] = np.mean(precip_middle[i:i+5,:,:], axis=0)
    precip_early_mean = np.zeros((nsamps_early//5, ny_gefsv12, nx_gefsv12), dtype=np.float64)
    for i in range(0, nsamps_early,5):
        precip_early_mean[i//5,:,:] = np.mean(precip_early[i:i+5,:,:], axis=0)    
    precip_late_mean = np.zeros((nsamps_late//5, ny_gefsv12, nx_gefsv12), dtype=np.float64)
    for i in range(0, nsamps_late,5):
        precip_late_mean[i//5,:,:] = np.mean(precip_late[i:i+5,:,:], axis=0)    

    # ---- more initialization of output storage arrays now that 
    #      we know the array dimensions

    fzero = np.zeros((ny_gefsv12, nx_gefsv12), dtype=np.float)
    Dnstat = 0.10*np.ones((ny_gefsv12, nx_gefsv12), dtype=np.float)
   
    # ---- loop over the grid points and estimate the Gamma distributions
    #      for each parameter.  First see if a single Gamma distribution
    #      is appropriate; if not, try a mixture of two.   If that still
    #      doesn't fit well, try a mixture of three.   

    now = datetime.now()
    begin_time = now.strftime("%H:%M:%S")
    plotit = False
 
    spline_info_fcst = np.zeros((17,2,ny_gefsv12,nx_gefsv12), dtype=np.float64) 
    usegamma_fcst = np.zeros((ny_gefsv12, nx_gefsv12), dtype=np.int32)
    #spline_info = np.zeros((ny_gefsv12,nx_gefsv12,2,9), dtype=np.float64) 
    indices_to_query_fcst = np.zeros((9,ny_gefsv12,nx_gefsv12), dtype=np.float16)
    empirical_precipvals_fcst = np.zeros((9,ny_gefsv12,nx_gefsv12), dtype=np.float16)
                   
    for jy in range(0,ny_gefsv12,nstride):
    #for jy in range(ny_gefsv12//2, ny_gefsv12//2+1):
        
        now = datetime.now()
        current_time = now.strftime("%H:%M:%S")
        if jy%20 == 0: print ('***** begin time, current time, jy, ny_gefsv12, lat = ',\
            begin_time, current_time, jy, ny_gefsv12, lats_1d[jy])        
        
        for ix in range(0,nx_gefsv12,nstride):
        #for ix in range (nx_gefsv12//2, nx_gefsv12//2+1):
            
            # ---- there is a grib round-off error that can give negative
            #      values slightly smaller than teeny precip.  to make sure 
            #      that we don't have either negative values or lots of the 
            #      same tiny values, subtractoff teeny_precip
        
            precip_ens_1d_middle = precip_middle_mean[:,jy,ix]
            precip_ens_1d_early = precip_early_mean[:,jy,ix]
            precip_ens_1d_late = precip_late_mean[:,jy,ix]
            precip_ens_1d = np.concatenate((precip_ens_1d_middle, \
                precip_ens_1d_early, precip_ens_1d_late))
        
            tp = np.min( [np.abs(np.min(precip_ens_1d)), 0.0] )
            teeny_precip = tp*np.ones(nsamps_mean)
            precip_ens_1d = precip_ens_1d - teeny_precip[:]

            fraction_zero, precip_ens_nonzero, nz = \
                fraczero_possamps(nsamps_mean, precip_ens_1d) # return sorted
            fzero[jy,ix] = fraction_zero 
        
            if nz > 40:

                # ---- spline fit with the focus on knots at higher quantiles

                query_these_indices = [ nz//10, nz//4, nz//2, (3*nz)//5, \
                    (4*nz)//5, (17*nz)//20, (9*nz)//10, (19*nz)//20, (39*nz)//40]
                indices_to_query_fcst[:,jy,ix] = query_these_indices[:]
                empirical_precipvals = precip_ens_nonzero[query_these_indices]
                empirical_precipvals_fcst[:,jy,ix] = empirical_precipvals[:]
                #print ('empirical_precipvals = ', empirical_precipvals)
        
                empirical_cdf = 1.0/(2.0*nz) + np.arange(nz)/nz      
                hazard_function_empirical = -np.log(1.0-empirical_cdf)
                spltemp = splrep(precip_ens_nonzero, hazard_function_empirical, \
                    xb=0., task=-1, t = empirical_precipvals)   
                spline_hazard = splev(precip_ens_nonzero, spltemp)
                spline_cdf = 1.0 - np.exp(-spline_hazard)       
                diff = np.abs(empirical_cdf - spline_cdf)
        
                # ---- save spline information to numpy array
        
                spline_info_fcst[:,0,jy,ix] = spltemp[0]
                spline_info_fcst[:,1,jy,ix] = spltemp[1]
                Dnstat[jy,ix] = np.max(diff)
        
            elif nz < 10:
            
                # --- over the whole training sample, there were not enough 
                #     nonzero precip values recorded.   Flag this as missing data.
            
                spline_info_fcst[:,0,jy,ix] = -99.99
                spline_info_fcst[:,1,jy,ix] = -99.99
                indices_to_query_fcst[:,jy,ix] = -99 
                usegamma_fcst[jy,ix] = 0
                Dnstat[jy,ix] = 0.0      

            else:
            
                # ---- too few samples to use splines; fit a single-parameter Gamma 
                #      using Thom (1958) estimator described in Wilks textbook,
                #      Statistical Methods in the Atmospheric Sciences.
            
                empirical_cdf = 1.0/(2.0*nz) + np.arange(nz)/nz

                pmean = np.mean(precip_ens_nonzero)
                lnxbar = np.log(pmean)
                if pflag == True: print ('pmean, lnxbar = ', pmean, lnxbar)
                meanlnxi = np.mean(np.log(precip_ens_nonzero))
                if pflag == True: print ('meanlnxi = ', meanlnxi)
                D = lnxbar - meanlnxi
                alpha_hat = (1.0 + np.sqrt(1.0+4.0*D/3.0)) / (4.0*D)
                beta_hat = pmean / alpha_hat
                if pflag == True: print('alpha_hat, beta_hat = ', alpha_hat, beta_hat)
                indices_to_query_fcst[:,jy,ix] = -1 # flag for using Gamma
                spline_info_fcst[:,0,jy,ix] = alpha_hat  # smoosh into the spline array
                spline_info_fcst[:,1,jy,ix] = beta_hat # smoosh into the spline array
                usegamma_fcst[jy,ix] = 1
            
                # --- evaluate Dn statistic, goodness of fit.
            
                y0 = precip_ens_nonzero / beta_hat
                fitted_CDF = stats.gamma.cdf(y0, alpha_hat)
                diff = np.abs(empirical_cdf - fitted_CDF)
                Dnstat[jy,ix] = np.max(diff)
        

            if plotit == True:
            
                # ---- plot the CDFs, forecast and analyzed, empirical and best fitted.

                f = plt.figure(figsize=(6.5,6.5))#
                ax = f.add_axes([.13,.14,.84,.75])
                title = 'Spline fit of hazard function, jy, ix = '+str(jy)+','+str(ix)
                ax.set_title(title,fontsize=13)
                ax.plot(precip_ens_nonzero, spline_hazard,color='Blue',\
                    lw=2,label='Cubic spline')
                ax.plot(precip_ens_nonzero, hazard_function_empirical,\
                    color='Red',lw=2,label='Empirical')
                plt.ylabel('Hazard function',fontsize=11)
                ax.legend(loc=0)
                ax.set_ylim(-0.2,6.0)
                plt.grid(True,lw=0.25,color='LightGray')
                ax.set_xlim(0,40)
                #ax.set_xlim(0,5)
                ax.set_xlabel('6-hourly total precipitation (mm)',fontsize=11)
                figname = 'hazard_function_precip_example_spline'+str(jy)+'.png'
                plt.savefig(figname,dpi=400)
                print ('Plot done', figname)
                plt.close()

            
                f = plt.figure(figsize=(6.5,6.5))#
                ax = f.add_axes([.13,.14,.84,.75])
                title = 'Spline fit of CDF via hazard function, jy, ix = '+\
                    str(jy)+','+str(ix)
                ax.set_title(title,fontsize=13)
                ax.plot(precip_ens_nonzero, spline_cdf,color='Blue',\
                    lw=2,label='Cubic spline')
                ax.plot(precip_ens_nonzero, empirical_cdf,color='Red',\
                    lw=2,label='Empirical')
                plt.ylabel('Non-exceedance probability',fontsize=11)
                ax.legend(loc=0)
                ax.set_ylim(0.,1)
                plt.grid(True,lw=0.25,color='LightGray')
                ax.set_xlim(0,40)
                #ax.set_xlim(0,5)
                ax.set_xlabel('6-hourly total precipitation (mm)',fontsize=11)
                figname = 'CDF_precip_example_spline'+str(jy)+'.png'
                plt.savefig(figname,dpi=400)
                print ('Plot done', figname)
                plt.close() 
                #sys.exit()

           
    # --- save to cPickle files

    #outfile = master_directory + cmonth+'_'+cdomain+\
    #    '_GEFSv12_spline_info_h' + clead + '.cPick'
    #print ('writing to ', outfile)
    #ouf = open(outfile, 'wb')
    #cPickle.dump(spline_info, ouf)
    #cPickle.dump(fzero, ouf)
    #cPickle.dump(indices_to_query, ouf)
    #cPickle.dump(empirical_precipvals_array, ouf)
    #ouf.close()

    print ('max Dnstat = ', np.max(Dnstat))
    outfile = master_directory_out + cmonthnum[imonth_index]+'_'+cdomain+\
        '_GEFSv12_mean_Dnstat_h' + clead + '.cPick'
    print ('writing to ', outfile)
    ouf = open(outfile, 'wb')
    cPickle.dump(Dnstat, ouf)
    cPickle.dump(lons_1d, ouf)
    cPickle.dump(lats_1d, ouf)
    ouf.close()

    # ---- save to netCDF file

    outfile = master_directory_out + cmonthnum[imonth_index]+'_'+cdomain+\
        '_GEFSv12_mean_spline_info_h' + clead + '.nc' 
    print ('writing to ',outfile)
    ncout = Dataset(outfile,'w',format='NETCDF4_CLASSIC')

    xf = ncout.createDimension('xf',nx_gefsv12)
    xvf = ncout.createVariable('xf','i4',('xf',))
    xvf.long_name = "eastward grid point number on NDFD grid"
    xvf.units = "n/a"

    yf = ncout.createDimension('yf',ny_gefsv12)
    yvf = ncout.createVariable('yf','i4',('yf',))
    yvf.long_name = "northward grid point number on NDFD grid"
    yvf.units = "n/a"

    xspd = ncout.createDimension('xspd',17)
    xspdf = ncout.createVariable('xspd','i4',('xspd',))
    xspdf.long_name = "index for spline dimension"
    xspdf.units = "n/a"

    x2 = ncout.createDimension('x2',2)
    x2f = ncout.createVariable('x2','i4',('x2',))
    x2f.long_name = "2nd index for spline dimension"
    x2f.units = "n/a"

    x9 = ncout.createDimension('x9',9)
    x9f = ncout.createVariable('x9','i4',('x9',))
    x9f.long_name = "first dimension of nonzero_indices_of_knots"
    x9f.units = "n/a"

    lonsa = ncout.createVariable('lons','f4',('xf',))
    lonsa.long_name = "longitude"
    lonsa.units = "degrees_east"

    latsa = ncout.createVariable('lats','f4',('yf',))
    latsa.long_name = "latitude"
    latsa.units = "degrees_north"

    spline_info_out = ncout.createVariable('spline_info',\
        'f4',('xspd','x2','yf','xf',),
        zlib=True,least_significant_digit=6)
    spline_info_out.units = "n/a"
    spline_info_out.long_name = \
        "Information for computing precipitation from"+\
        "spline (or Gamma CDF for dry points)"
    spline_info_out.missing_value = \
        np.array(-9999.99,dtype=np.float32)
    
    indices_to_query_out = ncout.createVariable('nonzero_indices_of_knots',\
        'i4',('x9','yf','xf',),
        zlib=True,least_significant_digit=6)
    indices_to_query_out.units = "n/a"
    indices_to_query_out.long_name = \
        "non-zero samples, indices of knots"
    indices_to_query_out.missing_value = \
        np.array(-99,dtype=np.int32)    

    fzero_out = ncout.createVariable('fzero','f4',('yf','xf',),
        zlib=True,least_significant_digit=6)
    fzero_out.units = "n/a"
    fzero_out.long_name = "fraction_zero"
    fzero_out.missing_value = np.array(-9999.99,dtype=np.float32)

    usegamma_out = ncout.createVariable('usegamma',\
        'i4',('yf','xf',), zlib=True)
    usegamma_out.units = "n/a"
    usegamma_out.long_name = "1 if fit CDF via Gamma distribution, 0 if not"
    usegamma_out.missing_value = np.array(-99,dtype=np.int32)

    # ---- metadata

    ncout.title = "NDFD domain spline inverse coefficients / Gamma parameters for 5-mem mean"
    ncout.history = "from CDF fitting code by Tom Hamill, PSL"
    ncout.institution =  "psl.noaa.gov"
    ncout.platform = "n/a"
    ncout.references = "n/a"
    
    # ---- copy the outputs to netCDF structures.

    xvf[:] = range(nx_gefsv12)
    yvf[:] = range(ny_gefsv12)
    xspdf[:] = range(17)
    x2f[:] = range(2)
    x9f[:] = range(9)
    lonsa[:] = lons_1d[:]
    latsa[:] = lats_1d[:]
    spline_info_out[:] = spline_info_fcst[:,:,:,:]
    fzero_out[:] = fzero[:,:]
    usegamma_out[:] = usegamma_fcst[:,:]
    indices_to_query_out[:] = indices_to_query_fcst[:,:,:] 

    ncout.close()