matheus__serpa

Untitled

Aug 27th, 2020
1,359
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 5.02 KB | None | 0 0
  1. %matplotlib inline
  2.  
  3. import warnings
  4. import numpy as np
  5. import pandas as pd
  6. import scipy.stats as st
  7. import statsmodels as sm
  8. import matplotlib
  9. import matplotlib.pyplot as plt
  10.  
  11. matplotlib.rcParams['figure.figsize'] = (16.0, 12.0)
  12. matplotlib.style.use('ggplot')
  13.  
  14. # Create models from data
  15. def best_fit_distribution(data, bins=200, ax=None):
  16.     """Model data by finding best fit distribution to data"""
  17.     # Get histogram of original data
  18.     y, x = np.histogram(data, bins=bins, density=True)
  19.     x = (x + np.roll(x, -1))[:-1] / 2.0
  20.  
  21.     # Distributions to check
  22.     DISTRIBUTIONS = [        
  23.         st.alpha,st.anglit,st.arcsine,st.beta,st.betaprime,st.bradford,st.burr,st.cauchy,st.chi,st.chi2,st.cosine,
  24.         st.dgamma,st.dweibull,st.erlang,st.expon,st.exponnorm,st.exponweib,st.exponpow,st.f,st.fatiguelife,st.fisk,
  25.         st.foldcauchy,st.foldnorm,st.frechet_r,st.frechet_l,st.genlogistic,st.genpareto,st.gennorm,st.genexpon,
  26.         st.genextreme,st.gausshyper,st.gamma,st.gengamma,st.genhalflogistic,st.gilbrat,st.gompertz,st.gumbel_r,
  27.         st.gumbel_l,st.halfcauchy,st.halflogistic,st.halfnorm,st.halfgennorm,st.hypsecant,st.invgamma,st.invgauss,
  28.         st.invweibull,st.johnsonsb,st.johnsonsu,st.ksone,st.kstwobign,st.laplace,st.levy,st.levy_l,st.levy_stable,
  29.         st.logistic,st.loggamma,st.loglaplace,st.lognorm,st.lomax,st.maxwell,st.mielke,st.nakagami,st.ncx2,st.ncf,
  30.         st.nct,st.norm,st.pareto,st.pearson3,st.powerlaw,st.powerlognorm,st.powernorm,st.rdist,st.reciprocal,
  31.         st.rayleigh,st.rice,st.recipinvgauss,st.semicircular,st.t,st.triang,st.truncexpon,st.truncnorm,st.tukeylambda,
  32.         st.uniform,st.vonmises,st.vonmises_line,st.wald,st.weibull_min,st.weibull_max,st.wrapcauchy
  33.     ]
  34.  
  35.     # Best holders
  36.     best_distribution = st.norm
  37.     best_params = (0.0, 1.0)
  38.     best_sse = np.inf
  39.  
  40.     # Estimate distribution parameters from data
  41.     for distribution in DISTRIBUTIONS:
  42.  
  43.         # Try to fit the distribution
  44.         try:
  45.             # Ignore warnings from data that can't be fit
  46.             with warnings.catch_warnings():
  47.                 warnings.filterwarnings('ignore')
  48.  
  49.                 # fit dist to data
  50.                 params = distribution.fit(data)
  51.  
  52.                 # Separate parts of parameters
  53.                 arg = params[:-2]
  54.                 loc = params[-2]
  55.                 scale = params[-1]
  56.  
  57.                 # Calculate fitted PDF and error with fit in distribution
  58.                 pdf = distribution.pdf(x, loc=loc, scale=scale, *arg)
  59.                 sse = np.sum(np.power(y - pdf, 2.0))
  60.  
  61.                 # if axis pass in add to plot
  62.                 try:
  63.                     if ax:
  64.                         pd.Series(pdf, x).plot(ax=ax)
  65.                     end
  66.                 except Exception:
  67.                     pass
  68.  
  69.                 # identify if this distribution is better
  70.                 if best_sse > sse > 0:
  71.                     best_distribution = distribution
  72.                     best_params = params
  73.                     best_sse = sse
  74.  
  75.         except Exception:
  76.             pass
  77.  
  78.     return (best_distribution.name, best_params)
  79.  
  80. def make_pdf(dist, params, size=10000):
  81.     """Generate distributions's Probability Distribution Function """
  82.  
  83.     # Separate parts of parameters
  84.     arg = params[:-2]
  85.     loc = params[-2]
  86.     scale = params[-1]
  87.  
  88.     # Get sane start and end points of distribution
  89.     start = dist.ppf(0.01, *arg, loc=loc, scale=scale) if arg else dist.ppf(0.01, loc=loc, scale=scale)
  90.     end = dist.ppf(0.99, *arg, loc=loc, scale=scale) if arg else dist.ppf(0.99, loc=loc, scale=scale)
  91.  
  92.     # Build PDF and turn into pandas Series
  93.     x = np.linspace(start, end, size)
  94.     y = dist.pdf(x, loc=loc, scale=scale, *arg)
  95.     pdf = pd.Series(y, x)
  96.  
  97.     return pdf
  98.  
  99. # Load data from statsmodels datasets
  100. data = pd.Series(sm.datasets.elnino.load_pandas().data.set_index('YEAR').values.ravel())
  101.  
  102. # Plot for comparison
  103. plt.figure(figsize=(12,8))
  104. ax = data.plot(kind='hist', bins=50, normed=True, alpha=0.5, color=plt.rcParams['axes.color_cycle'][1])
  105. # Save plot limits
  106. dataYLim = ax.get_ylim()
  107.  
  108. # Find best fit distribution
  109. best_fit_name, best_fit_params = best_fit_distribution(data, 200, ax)
  110. best_dist = getattr(st, best_fit_name)
  111.  
  112. # Update plots
  113. ax.set_ylim(dataYLim)
  114. ax.set_title(u'El Niño sea temp.\n All Fitted Distributions')
  115. ax.set_xlabel(u'Temp (°C)')
  116. ax.set_ylabel('Frequency')
  117.  
  118. # Make PDF with best params
  119. pdf = make_pdf(best_dist, best_fit_params)
  120.  
  121. # Display
  122. plt.figure(figsize=(12,8))
  123. ax = pdf.plot(lw=2, label='PDF', legend=True)
  124. data.plot(kind='hist', bins=50, normed=True, alpha=0.5, label='Data', legend=True, ax=ax)
  125.  
  126. param_names = (best_dist.shapes + ', loc, scale').split(', ') if best_dist.shapes else ['loc', 'scale']
  127. param_str = ', '.join(['{}={:0.2f}'.format(k,v) for k,v in zip(param_names, best_fit_params)])
  128. dist_str = '{}({})'.format(best_fit_name, param_str)
  129.  
  130. ax.set_title(u'El Niño sea temp. with best fit distribution \n' + dist_str)
  131. ax.set_xlabel(u'Temp. (°C)')
  132. ax.set_ylabel('Frequency')
Advertisement
Add Comment
Please, Sign In to add comment