454 lines
14 KiB
Matlab
454 lines
14 KiB
Matlab
function [D PD] = allfitdist(data,sortby,varargin)
|
|
%ALLFITDIST Fit all valid parametric probability distributions to data.
|
|
% [D PD] = ALLFITDIST(DATA) fits all valid parametric probability
|
|
% distributions to the data in vector DATA, and returns a struct D of
|
|
% fitted distributions and parameters and a struct of objects PD
|
|
% representing the fitted distributions. PD is an object in a class
|
|
% derived from the ProbDist class.
|
|
%
|
|
% [...] = ALLFITDIST(DATA,SORTBY) returns the struct of valid distributions
|
|
% sorted by the parameter SORTBY
|
|
% NLogL - Negative of the log likelihood
|
|
% BIC - Bayesian information criterion (default)
|
|
% AIC - Akaike information criterion
|
|
% AICc - AIC with a correction for finite sample sizes
|
|
%
|
|
% [...] = ALLFITDIST(...,'DISCRETE') specifies it is a discrete
|
|
% distribution and does not attempt to fit a continuous distribution
|
|
% to the data
|
|
%
|
|
% [...] = ALLFITDIST(...,'PDF') or (...,'CDF') plots either the PDF or CDF
|
|
% of a subset of the fitted distribution. The distributions are plotted in
|
|
% order of fit, according to SORTBY.
|
|
%
|
|
% List of distributions it will try to fit
|
|
% Continuous (default)
|
|
% Beta
|
|
% Birnbaum-Saunders
|
|
% Exponential
|
|
% Extreme value
|
|
% Gamma
|
|
% Generalized extreme value
|
|
% Generalized Pareto
|
|
% Inverse Gaussian
|
|
% Logistic
|
|
% Log-logistic
|
|
% Lognormal
|
|
% Nakagami
|
|
% Normal
|
|
% Rayleigh
|
|
% Rician
|
|
% t location-scale
|
|
% Weibull
|
|
%
|
|
% Discrete ('DISCRETE')
|
|
% Binomial
|
|
% Negative binomial
|
|
% Poisson
|
|
%
|
|
% Optional inputs:
|
|
% [...] = ALLFITDIST(...,'n',N,...)
|
|
% For the 'binomial' distribution only:
|
|
% 'n' A positive integer specifying the N parameter (number
|
|
% of trials). Not allowed for other distributions. If
|
|
% 'n' is not given it is estimate by Method of Moments.
|
|
% If the estimated 'n' is negative then the maximum
|
|
% value of data will be used as the estimated value.
|
|
% [...] = ALLFITDIST(...,'theta',THETA,...)
|
|
% For the 'generalized pareto' distribution only:
|
|
% 'theta' The value of the THETA (threshold) parameter for
|
|
% the generalized Pareto distribution. Not allowed for
|
|
% other distributions. If 'theta' is not given it is
|
|
% estimated by the minimum value of the data.
|
|
%
|
|
% Note: ALLFITDIST does not handle nonparametric kernel-smoothing,
|
|
% use FITDIST directly instead.
|
|
%
|
|
%
|
|
% EXAMPLE 1
|
|
% Given random data from an unknown continuous distribution, find the
|
|
% best distribution which fits that data, and plot the PDFs to compare
|
|
% graphically.
|
|
% data = normrnd(5,3,1e4,1); %Assumed from unknown distribution
|
|
% [D PD] = allfitdist(data,'PDF'); %Compute and plot results
|
|
% D(1) %Show output from best fit
|
|
%
|
|
% EXAMPLE 2
|
|
% Given random data from a discrete unknown distribution, with frequency
|
|
% data, find the best discrete distribution which would fit that data,
|
|
% sorted by 'NLogL', and plot the PDFs to compare graphically.
|
|
% data = nbinrnd(20,.3,1e4,1);
|
|
% values=unique(data); freq=histc(data,values);
|
|
% [D PD] = allfitdist(values,'NLogL','frequency',freq,'PDF','DISCRETE');
|
|
% PD{1}
|
|
%
|
|
% EXAMPLE 3
|
|
% Although the Geometric Distribution is not listed, it is a special
|
|
% case of fitting the more general Negative Binomial Distribution. The
|
|
% parameter 'r' should be close to 1. Show by example.
|
|
% data=geornd(.7,1e4,1); %Random from Geometric
|
|
% [D PD]= allfitdist(data,'PDF','DISCRETE');
|
|
% PD{1}
|
|
%
|
|
% EXAMPLE 4
|
|
% Compare the resulting distributions under two different assumptions
|
|
% of discrete data. The first, that it is known to be derived from a
|
|
% Binomial Distribution with known 'n'. The second, that it may be
|
|
% Binomial but 'n' is unknown and should be estimated. Note the second
|
|
% scenario may not yield a Binomial Distribution as the best fit, if
|
|
% 'n' is estimated incorrectly. (Best to run example a couple times
|
|
% to see effect)
|
|
% data = binornd(10,.3,1e2,1);
|
|
% [D1 PD1] = allfitdist(data,'n',10,'DISCRETE','PDF'); %Force binomial
|
|
% [D2 PD2] = allfitdist(data,'DISCRETE','PDF'); %May be binomial
|
|
% PD1{1}, PD2{1} %Compare distributions
|
|
%
|
|
|
|
% Mike Sheppard
|
|
% Last Modified: 17-Feb-2012
|
|
|
|
|
|
|
|
|
|
%% Check Inputs
|
|
if nargin == 0
|
|
data = 10.^((normrnd(2,10,1e4,1))/10);
|
|
sortby='BIC';
|
|
varargin={'CDF'};
|
|
end
|
|
if nargin==1
|
|
sortby='BIC';
|
|
end
|
|
sortbyname={'NLogL','BIC','AIC','AICc'};
|
|
if ~any(ismember(lower(sortby),lower(sortbyname)))
|
|
oldvar=sortby; %May be 'PDF' or 'CDF' or other commands
|
|
if isempty(varargin)
|
|
varargin={oldvar};
|
|
else
|
|
varargin=[oldvar varargin];
|
|
end
|
|
sortby='BIC';
|
|
end
|
|
if nargin < 2, sortby='BIC'; end
|
|
distname={'beta', 'birnbaumsaunders', 'exponential', ...
|
|
'extreme value', 'gamma', 'generalized extreme value', ...
|
|
'generalized pareto', 'inversegaussian', 'logistic', 'loglogistic', ...
|
|
'lognormal', 'nakagami', 'normal', ...
|
|
'rayleigh', 'rician', 'tlocationscale', 'weibull'};
|
|
if ~any(strcmpi(sortby,sortbyname))
|
|
error('allfitdist:SortBy','Sorting must be either NLogL, BIC, AIC, or AICc');
|
|
end
|
|
%Input may be mixed of numeric and strings, find only strings
|
|
vin=varargin;
|
|
strs=find(cellfun(@(vs)ischar(vs),vin));
|
|
vin(strs)=lower(vin(strs));
|
|
%Next check to see if 'PDF' or 'CDF' is listed
|
|
numplots=sum(ismember(vin(strs),{'pdf' 'cdf'}));
|
|
if numplots>=2
|
|
error('ALLFITDIST:PlotType','Either PDF or CDF must be given');
|
|
end
|
|
if numplots==1
|
|
plotind=true; %plot indicator
|
|
indxpdf=ismember(vin(strs),'pdf');
|
|
plotpdf=any(indxpdf);
|
|
indxcdf=ismember(vin(strs),'cdf');
|
|
vin(strs(indxpdf|indxcdf))=[]; %Delete 'PDF' and 'CDF' in vin
|
|
else
|
|
plotind=false;
|
|
end
|
|
%Check to see if discrete
|
|
strs=find(cellfun(@(vs)ischar(vs),vin));
|
|
indxdis=ismember(vin(strs),'discrete');
|
|
discind=false;
|
|
if any(indxdis)
|
|
discind=true;
|
|
distname={'binomial', 'negative binomial', 'poisson'};
|
|
vin(strs(indxdis))=[]; %Delete 'DISCRETE' in vin
|
|
end
|
|
strs=find(cellfun(@(vs)ischar(vs),vin));
|
|
n=numel(data); %Number of data points
|
|
data = data(:);
|
|
D=[];
|
|
%Check for NaN's to delete
|
|
deldatanan=isnan(data);
|
|
%Check to see if frequency is given
|
|
indxf=ismember(vin(strs),'frequency');
|
|
if any(indxf)
|
|
freq=vin{1+strs((indxf))}; freq=freq(:);
|
|
if numel(freq)~=numel(data)
|
|
error('ALLFITDIST:PlotType','Matrix dimensions must agree');
|
|
end
|
|
delfnan=isnan(freq);
|
|
data(deldatanan|delfnan)=[]; freq(deldatanan|delfnan)=[];
|
|
%Save back into vin
|
|
vin{1+strs((indxf))}=freq;
|
|
else
|
|
data(deldatanan)=[];
|
|
end
|
|
|
|
|
|
|
|
|
|
|
|
%% Run through all distributions in FITDIST function
|
|
warning('off','all'); %Turn off all future warnings
|
|
for indx=1:length(distname)
|
|
try
|
|
dname=distname{indx};
|
|
switch dname
|
|
case 'binomial'
|
|
PD=fitbinocase(data,vin,strs); %Special case
|
|
case 'generalized pareto'
|
|
PD=fitgpcase(data,vin,strs); %Special case
|
|
otherwise
|
|
%Built-in distribution using FITDIST
|
|
PD = fitdist(data,dname,vin{:});
|
|
end
|
|
|
|
NLL=PD.NLogL; % -Log(L)
|
|
%If NLL is non-finite number, produce error to ignore distribution
|
|
if ~isfinite(NLL)
|
|
error('non-finite NLL');
|
|
end
|
|
num=length(D)+1;
|
|
PDs(num) = {PD}; %#ok<*AGROW>
|
|
k=numel(PD.Params); %Number of parameters
|
|
D(num).DistName=PD.DistName;
|
|
D(num).NLogL=NLL;
|
|
D(num).BIC=-2*(-NLL)+k*log(n);
|
|
D(num).AIC=-2*(-NLL)+2*k;
|
|
D(num).AICc=(D(num).AIC)+((2*k*(k+1))/(n-k-1));
|
|
D(num).ParamNames=PD.ParamNames;
|
|
D(num).ParamDescription=PD.ParamDescription;
|
|
D(num).Params=PD.Params;
|
|
D(num).Paramci=PD.paramci;
|
|
D(num).ParamCov=PD.ParamCov;
|
|
D(num).Support=PD.Support;
|
|
catch err %#ok<NASGU>
|
|
%Ignore distribution
|
|
end
|
|
end
|
|
warning('on','all'); %Turn back on warnings
|
|
if numel(D)==0
|
|
error('ALLFITDIST:NoDist','No distributions were found');
|
|
end
|
|
|
|
|
|
|
|
|
|
|
|
%% Sort distributions
|
|
indx1=1:length(D); %Identity Map
|
|
sortbyindx=find(strcmpi(sortby,sortbyname));
|
|
switch sortbyindx
|
|
case 1
|
|
[~,indx1]=sort([D.NLogL]);
|
|
case 2
|
|
[~,indx1]=sort([D.BIC]);
|
|
case 3
|
|
[~,indx1]=sort([D.AIC]);
|
|
case 4
|
|
[~,indx1]=sort([D.AICc]);
|
|
end
|
|
%Sort
|
|
D=D(indx1); PD = PDs(indx1);
|
|
|
|
|
|
|
|
|
|
|
|
%% Plot if requested
|
|
if plotind;
|
|
plotfigs(data,D,PD,vin,strs,plotpdf,discind)
|
|
end
|
|
|
|
|
|
end
|
|
|
|
|
|
|
|
|
|
|
|
function PD=fitbinocase(data,vin,strs)
|
|
%% Special Case for Binomial
|
|
% 'n' is estimated if not given
|
|
vinbino=vin;
|
|
%Check to see if 'n' is given
|
|
indxn=any(ismember(vin(strs),'n'));
|
|
%Check to see if 'frequency' is given
|
|
indxfreq=ismember(vin(strs),'frequency');
|
|
if ~indxn
|
|
%Use Method of Moment estimator
|
|
%E[x]=np, V[x]=np(1-p) -> nhat=E/(1-(V/E));
|
|
if isempty(indxfreq)||~any(indxfreq)
|
|
%Raw data
|
|
mnx=mean(data);
|
|
nhat=round(mnx/(1-(var(data)/mnx)));
|
|
else
|
|
%Frequency data
|
|
freq=vin{1+strs(indxfreq)};
|
|
m1=dot(data,freq)/sum(freq);
|
|
m2=dot(data.^2,freq)/sum(freq);
|
|
mnx=m1; vx=m2-(m1^2);
|
|
nhat=round(mnx/(1-(vx/mnx)));
|
|
end
|
|
%If nhat is negative, use maximum value of data
|
|
if nhat<=0, nhat=max(data(:)); end
|
|
vinbino{end+1}='n'; vinbino{end+1}=nhat;
|
|
end
|
|
PD = fitdist(data,'binomial',vinbino{:});
|
|
end
|
|
|
|
|
|
|
|
|
|
|
|
function PD=fitgpcase(data,vin,strs)
|
|
%% Special Case for Generalized Pareto
|
|
% 'theta' is estimated if not given
|
|
vingp=vin;
|
|
%Check to see if 'theta' is given
|
|
indxtheta=any(ismember(vin(strs),'theta'));
|
|
if ~indxtheta
|
|
%Use minimum value for theta, minus small part
|
|
thetahat=min(data(:))-10*eps;
|
|
vingp{end+1}='theta'; vingp{end+1}=thetahat;
|
|
end
|
|
PD = fitdist(data,'generalized pareto',vingp{:});
|
|
end
|
|
|
|
|
|
|
|
|
|
|
|
function plotfigs(data,D,PD,vin,strs,plotpdf,discind)
|
|
%Plot functionality for continuous case due to Jonathan Sullivan
|
|
%Modified by author for discrete case
|
|
|
|
%Maximum number of distributions to include
|
|
%max_num_dist=Inf; %All valid distributions
|
|
max_num_dist=4;
|
|
|
|
%Check to see if frequency is given
|
|
indxf=ismember(vin(strs),'frequency');
|
|
if any(indxf)
|
|
freq=vin{1+strs((indxf))};
|
|
end
|
|
|
|
figure
|
|
|
|
%% Probability Density / Mass Plot
|
|
if plotpdf
|
|
if ~discind
|
|
%Continuous Data
|
|
nbins = max(min(length(data)./10,100),50);
|
|
xi = linspace(min(data),max(data),nbins);
|
|
dx = mean(diff(xi));
|
|
xi2 = linspace(min(data),max(data),nbins*10)';
|
|
fi = histc(data,xi-dx);
|
|
fi = fi./sum(fi)./dx;
|
|
inds = 1:min([max_num_dist,numel(PD)]);
|
|
ys = cellfun(@(PD) pdf(PD,xi2),PD(inds),'UniformOutput',0);
|
|
ys = cat(2,ys{:});
|
|
bar(xi,fi,'FaceColor',[160 188 254]/255,'EdgeColor','k');
|
|
hold on;
|
|
plot(xi2,ys,'LineWidth',1.5)
|
|
legend(['empirical',{D(inds).DistName}],'Location','NE')
|
|
xlabel('Value');
|
|
ylabel('Probability Density');
|
|
title('Probability Density Function');
|
|
grid on
|
|
else
|
|
%Discrete Data
|
|
xi2=min(data):max(data);
|
|
%xi2=unique(x)'; %If only want observed x-values to be shown
|
|
indxf=ismember(vin(strs),'frequency');
|
|
if any(indxf)
|
|
fi=zeros(size(xi2));
|
|
fi((ismember(xi2,data)))=freq; fi=fi'./sum(fi);
|
|
else
|
|
fi=histc(data,xi2); fi=fi./sum(fi);
|
|
end
|
|
inds = 1:min([max_num_dist,numel(PD)]);
|
|
ys = cellfun(@(PD) pdf(PD,xi2),PD(inds),'UniformOutput',0);
|
|
ys=cat(1,ys{:})';
|
|
bar(xi2,[fi ys]);
|
|
legend(['empirical',{D(inds).DistName}],'Location','NE')
|
|
xlabel('Value');
|
|
ylabel('Probability Mass');
|
|
title('Probability Mass Function');
|
|
grid on
|
|
end
|
|
else
|
|
|
|
%Cumulative Distribution
|
|
if ~discind
|
|
%Continuous Data
|
|
[fi xi] = ecdf(data);
|
|
inds = 1:min([max_num_dist,numel(PD)]);
|
|
ys = cellfun(@(PD) cdf(PD,xi),PD(inds),'UniformOutput',0);
|
|
ys = cat(2,ys{:});
|
|
if max(xi)/min(xi) > 1e4; lgx = true; else lgx = false; end
|
|
subplot(2,1,1)
|
|
if lgx
|
|
semilogx(xi,fi,'k',xi,ys)
|
|
else
|
|
plot(xi,fi,'k',xi,ys)
|
|
end
|
|
legend(['empirical',{D(inds).DistName}],'Location','NE')
|
|
xlabel('Value');
|
|
ylabel('Cumulative Probability');
|
|
title('Cumulative Distribution Function');
|
|
grid on
|
|
subplot(2,1,2)
|
|
y = 1.1*bsxfun(@minus,ys,fi);
|
|
if lgx
|
|
semilogx(xi,bsxfun(@minus,ys,fi))
|
|
else
|
|
plot(xi,bsxfun(@minus,ys,fi))
|
|
end
|
|
ybnds = max(abs(y(:)));
|
|
ax = axis;
|
|
axis([ax(1:2) -ybnds ybnds]);
|
|
legend({D(inds).DistName},'Location','NE')
|
|
xlabel('Value');
|
|
ylabel('Error');
|
|
title('CDF Error');
|
|
grid on
|
|
else
|
|
%Discrete Data
|
|
indxf=ismember(vin(strs),'frequency');
|
|
if any(indxf)
|
|
[fi xi] = ecdf(data,'frequency',freq);
|
|
else
|
|
[fi xi] = ecdf(data);
|
|
end
|
|
%Check unique xi, combine fi
|
|
[xi,ign,indx]=unique(xi); %#ok<ASGLU>
|
|
fi=accumarray(indx,fi);
|
|
inds = 1:min([max_num_dist,numel(PD)]);
|
|
ys = cellfun(@(PD) cdf(PD,xi),PD(inds),'UniformOutput',0);
|
|
ys=cat(2,ys{:});
|
|
subplot(2,1,1)
|
|
stairs(xi,[fi ys]);
|
|
legend(['empirical',{D(inds).DistName}],'Location','NE')
|
|
xlabel('Value');
|
|
ylabel('Cumulative Probability');
|
|
title('Cumulative Distribution Function');
|
|
grid on
|
|
subplot(2,1,2)
|
|
y = 1.1*bsxfun(@minus,ys,fi);
|
|
stairs(xi,bsxfun(@minus,ys,fi))
|
|
ybnds = max(abs(y(:)));
|
|
ax = axis;
|
|
axis([ax(1:2) -ybnds ybnds]);
|
|
legend({D(inds).DistName},'Location','NE')
|
|
xlabel('Value');
|
|
ylabel('Error');
|
|
title('CDF Error');
|
|
grid on
|
|
end
|
|
end
|
|
|
|
end
|