diff --git a/allfitdist.m b/allfitdist.m new file mode 100644 index 0000000..53a498e --- /dev/null +++ b/allfitdist.m @@ -0,0 +1,453 @@ +function [D PD] = allfitdist(data,sortby,varargin) +%ALLFITDIST Fit all valid parametric probability distributions to data. +% [D PD] = ALLFITDIST(DATA) fits all valid parametric probability +% distributions to the data in vector DATA, and returns a struct D of +% fitted distributions and parameters and a struct of objects PD +% representing the fitted distributions. PD is an object in a class +% derived from the ProbDist class. +% +% [...] = ALLFITDIST(DATA,SORTBY) returns the struct of valid distributions +% sorted by the parameter SORTBY +% NLogL - Negative of the log likelihood +% BIC - Bayesian information criterion (default) +% AIC - Akaike information criterion +% AICc - AIC with a correction for finite sample sizes +% +% [...] = ALLFITDIST(...,'DISCRETE') specifies it is a discrete +% distribution and does not attempt to fit a continuous distribution +% to the data +% +% [...] = ALLFITDIST(...,'PDF') or (...,'CDF') plots either the PDF or CDF +% of a subset of the fitted distribution. The distributions are plotted in +% order of fit, according to SORTBY. +% +% List of distributions it will try to fit +% Continuous (default) +% Beta +% Birnbaum-Saunders +% Exponential +% Extreme value +% Gamma +% Generalized extreme value +% Generalized Pareto +% Inverse Gaussian +% Logistic +% Log-logistic +% Lognormal +% Nakagami +% Normal +% Rayleigh +% Rician +% t location-scale +% Weibull +% +% Discrete ('DISCRETE') +% Binomial +% Negative binomial +% Poisson +% +% Optional inputs: +% [...] = ALLFITDIST(...,'n',N,...) +% For the 'binomial' distribution only: +% 'n' A positive integer specifying the N parameter (number +% of trials). Not allowed for other distributions. If +% 'n' is not given it is estimate by Method of Moments. +% If the estimated 'n' is negative then the maximum +% value of data will be used as the estimated value. +% [...] = ALLFITDIST(...,'theta',THETA,...) +% For the 'generalized pareto' distribution only: +% 'theta' The value of the THETA (threshold) parameter for +% the generalized Pareto distribution. Not allowed for +% other distributions. If 'theta' is not given it is +% estimated by the minimum value of the data. +% +% Note: ALLFITDIST does not handle nonparametric kernel-smoothing, +% use FITDIST directly instead. +% +% +% EXAMPLE 1 +% Given random data from an unknown continuous distribution, find the +% best distribution which fits that data, and plot the PDFs to compare +% graphically. +% data = normrnd(5,3,1e4,1); %Assumed from unknown distribution +% [D PD] = allfitdist(data,'PDF'); %Compute and plot results +% D(1) %Show output from best fit +% +% EXAMPLE 2 +% Given random data from a discrete unknown distribution, with frequency +% data, find the best discrete distribution which would fit that data, +% sorted by 'NLogL', and plot the PDFs to compare graphically. +% data = nbinrnd(20,.3,1e4,1); +% values=unique(data); freq=histc(data,values); +% [D PD] = allfitdist(values,'NLogL','frequency',freq,'PDF','DISCRETE'); +% PD{1} +% +% EXAMPLE 3 +% Although the Geometric Distribution is not listed, it is a special +% case of fitting the more general Negative Binomial Distribution. The +% parameter 'r' should be close to 1. Show by example. +% data=geornd(.7,1e4,1); %Random from Geometric +% [D PD]= allfitdist(data,'PDF','DISCRETE'); +% PD{1} +% +% EXAMPLE 4 +% Compare the resulting distributions under two different assumptions +% of discrete data. The first, that it is known to be derived from a +% Binomial Distribution with known 'n'. The second, that it may be +% Binomial but 'n' is unknown and should be estimated. Note the second +% scenario may not yield a Binomial Distribution as the best fit, if +% 'n' is estimated incorrectly. (Best to run example a couple times +% to see effect) +% data = binornd(10,.3,1e2,1); +% [D1 PD1] = allfitdist(data,'n',10,'DISCRETE','PDF'); %Force binomial +% [D2 PD2] = allfitdist(data,'DISCRETE','PDF'); %May be binomial +% PD1{1}, PD2{1} %Compare distributions +% + +% Mike Sheppard +% Last Modified: 17-Feb-2012 + + + + +%% Check Inputs +if nargin == 0 + data = 10.^((normrnd(2,10,1e4,1))/10); + sortby='BIC'; + varargin={'CDF'}; +end +if nargin==1 + sortby='BIC'; +end +sortbyname={'NLogL','BIC','AIC','AICc'}; +if ~any(ismember(lower(sortby),lower(sortbyname))) + oldvar=sortby; %May be 'PDF' or 'CDF' or other commands + if isempty(varargin) + varargin={oldvar}; + else + varargin=[oldvar varargin]; + end + sortby='BIC'; +end +if nargin < 2, sortby='BIC'; end +distname={'beta', 'birnbaumsaunders', 'exponential', ... + 'extreme value', 'gamma', 'generalized extreme value', ... + 'generalized pareto', 'inversegaussian', 'logistic', 'loglogistic', ... + 'lognormal', 'nakagami', 'normal', ... + 'rayleigh', 'rician', 'tlocationscale', 'weibull'}; +if ~any(strcmpi(sortby,sortbyname)) + error('allfitdist:SortBy','Sorting must be either NLogL, BIC, AIC, or AICc'); +end +%Input may be mixed of numeric and strings, find only strings +vin=varargin; +strs=find(cellfun(@(vs)ischar(vs),vin)); +vin(strs)=lower(vin(strs)); +%Next check to see if 'PDF' or 'CDF' is listed +numplots=sum(ismember(vin(strs),{'pdf' 'cdf'})); +if numplots>=2 + error('ALLFITDIST:PlotType','Either PDF or CDF must be given'); +end +if numplots==1 + plotind=true; %plot indicator + indxpdf=ismember(vin(strs),'pdf'); + plotpdf=any(indxpdf); + indxcdf=ismember(vin(strs),'cdf'); + vin(strs(indxpdf|indxcdf))=[]; %Delete 'PDF' and 'CDF' in vin +else + plotind=false; +end +%Check to see if discrete +strs=find(cellfun(@(vs)ischar(vs),vin)); +indxdis=ismember(vin(strs),'discrete'); +discind=false; +if any(indxdis) + discind=true; + distname={'binomial', 'negative binomial', 'poisson'}; + vin(strs(indxdis))=[]; %Delete 'DISCRETE' in vin +end +strs=find(cellfun(@(vs)ischar(vs),vin)); +n=numel(data); %Number of data points +data = data(:); +D=[]; +%Check for NaN's to delete +deldatanan=isnan(data); +%Check to see if frequency is given +indxf=ismember(vin(strs),'frequency'); +if any(indxf) + freq=vin{1+strs((indxf))}; freq=freq(:); + if numel(freq)~=numel(data) + error('ALLFITDIST:PlotType','Matrix dimensions must agree'); + end + delfnan=isnan(freq); + data(deldatanan|delfnan)=[]; freq(deldatanan|delfnan)=[]; + %Save back into vin + vin{1+strs((indxf))}=freq; +else + data(deldatanan)=[]; +end + + + + + +%% Run through all distributions in FITDIST function +warning('off','all'); %Turn off all future warnings +for indx=1:length(distname) + try + dname=distname{indx}; + switch dname + case 'binomial' + PD=fitbinocase(data,vin,strs); %Special case + case 'generalized pareto' + PD=fitgpcase(data,vin,strs); %Special case + otherwise + %Built-in distribution using FITDIST + PD = fitdist(data,dname,vin{:}); + end + + NLL=PD.NLogL; % -Log(L) + %If NLL is non-finite number, produce error to ignore distribution + if ~isfinite(NLL) + error('non-finite NLL'); + end + num=length(D)+1; + PDs(num) = {PD}; %#ok<*AGROW> + k=numel(PD.Params); %Number of parameters + D(num).DistName=PD.DistName; + D(num).NLogL=NLL; + D(num).BIC=-2*(-NLL)+k*log(n); + D(num).AIC=-2*(-NLL)+2*k; + D(num).AICc=(D(num).AIC)+((2*k*(k+1))/(n-k-1)); + D(num).ParamNames=PD.ParamNames; + D(num).ParamDescription=PD.ParamDescription; + D(num).Params=PD.Params; + D(num).Paramci=PD.paramci; + D(num).ParamCov=PD.ParamCov; + D(num).Support=PD.Support; + catch err %#ok + %Ignore distribution + end +end +warning('on','all'); %Turn back on warnings +if numel(D)==0 + error('ALLFITDIST:NoDist','No distributions were found'); +end + + + + + +%% Sort distributions +indx1=1:length(D); %Identity Map +sortbyindx=find(strcmpi(sortby,sortbyname)); +switch sortbyindx + case 1 + [~,indx1]=sort([D.NLogL]); + case 2 + [~,indx1]=sort([D.BIC]); + case 3 + [~,indx1]=sort([D.AIC]); + case 4 + [~,indx1]=sort([D.AICc]); +end +%Sort +D=D(indx1); PD = PDs(indx1); + + + + + +%% Plot if requested +if plotind; + plotfigs(data,D,PD,vin,strs,plotpdf,discind) +end + + +end + + + + + +function PD=fitbinocase(data,vin,strs) +%% Special Case for Binomial +% 'n' is estimated if not given +vinbino=vin; +%Check to see if 'n' is given +indxn=any(ismember(vin(strs),'n')); +%Check to see if 'frequency' is given +indxfreq=ismember(vin(strs),'frequency'); +if ~indxn + %Use Method of Moment estimator + %E[x]=np, V[x]=np(1-p) -> nhat=E/(1-(V/E)); + if isempty(indxfreq)||~any(indxfreq) + %Raw data + mnx=mean(data); + nhat=round(mnx/(1-(var(data)/mnx))); + else + %Frequency data + freq=vin{1+strs(indxfreq)}; + m1=dot(data,freq)/sum(freq); + m2=dot(data.^2,freq)/sum(freq); + mnx=m1; vx=m2-(m1^2); + nhat=round(mnx/(1-(vx/mnx))); + end + %If nhat is negative, use maximum value of data + if nhat<=0, nhat=max(data(:)); end + vinbino{end+1}='n'; vinbino{end+1}=nhat; +end +PD = fitdist(data,'binomial',vinbino{:}); +end + + + + + +function PD=fitgpcase(data,vin,strs) +%% Special Case for Generalized Pareto +% 'theta' is estimated if not given +vingp=vin; +%Check to see if 'theta' is given +indxtheta=any(ismember(vin(strs),'theta')); +if ~indxtheta + %Use minimum value for theta, minus small part + thetahat=min(data(:))-10*eps; + vingp{end+1}='theta'; vingp{end+1}=thetahat; +end +PD = fitdist(data,'generalized pareto',vingp{:}); +end + + + + + +function plotfigs(data,D,PD,vin,strs,plotpdf,discind) +%Plot functionality for continuous case due to Jonathan Sullivan +%Modified by author for discrete case + +%Maximum number of distributions to include +%max_num_dist=Inf; %All valid distributions +max_num_dist=4; + +%Check to see if frequency is given +indxf=ismember(vin(strs),'frequency'); +if any(indxf) + freq=vin{1+strs((indxf))}; +end + +figure + +%% Probability Density / Mass Plot +if plotpdf + if ~discind + %Continuous Data + nbins = max(min(length(data)./10,100),50); + xi = linspace(min(data),max(data),nbins); + dx = mean(diff(xi)); + xi2 = linspace(min(data),max(data),nbins*10)'; + fi = histc(data,xi-dx); + fi = fi./sum(fi)./dx; + inds = 1:min([max_num_dist,numel(PD)]); + ys = cellfun(@(PD) pdf(PD,xi2),PD(inds),'UniformOutput',0); + ys = cat(2,ys{:}); + bar(xi,fi,'FaceColor',[160 188 254]/255,'EdgeColor','k'); + hold on; + plot(xi2,ys,'LineWidth',1.5) + legend(['empirical',{D(inds).DistName}],'Location','NE') + xlabel('Value'); + ylabel('Probability Density'); + title('Probability Density Function'); + grid on + else + %Discrete Data + xi2=min(data):max(data); + %xi2=unique(x)'; %If only want observed x-values to be shown + indxf=ismember(vin(strs),'frequency'); + if any(indxf) + fi=zeros(size(xi2)); + fi((ismember(xi2,data)))=freq; fi=fi'./sum(fi); + else + fi=histc(data,xi2); fi=fi./sum(fi); + end + inds = 1:min([max_num_dist,numel(PD)]); + ys = cellfun(@(PD) pdf(PD,xi2),PD(inds),'UniformOutput',0); + ys=cat(1,ys{:})'; + bar(xi2,[fi ys]); + legend(['empirical',{D(inds).DistName}],'Location','NE') + xlabel('Value'); + ylabel('Probability Mass'); + title('Probability Mass Function'); + grid on + end +else + +%Cumulative Distribution + if ~discind + %Continuous Data + [fi xi] = ecdf(data); + inds = 1:min([max_num_dist,numel(PD)]); + ys = cellfun(@(PD) cdf(PD,xi),PD(inds),'UniformOutput',0); + ys = cat(2,ys{:}); + if max(xi)/min(xi) > 1e4; lgx = true; else lgx = false; end + subplot(2,1,1) + if lgx + semilogx(xi,fi,'k',xi,ys) + else + plot(xi,fi,'k',xi,ys) + end + legend(['empirical',{D(inds).DistName}],'Location','NE') + xlabel('Value'); + ylabel('Cumulative Probability'); + title('Cumulative Distribution Function'); + grid on + subplot(2,1,2) + y = 1.1*bsxfun(@minus,ys,fi); + if lgx + semilogx(xi,bsxfun(@minus,ys,fi)) + else + plot(xi,bsxfun(@minus,ys,fi)) + end + ybnds = max(abs(y(:))); + ax = axis; + axis([ax(1:2) -ybnds ybnds]); + legend({D(inds).DistName},'Location','NE') + xlabel('Value'); + ylabel('Error'); + title('CDF Error'); + grid on + else + %Discrete Data + indxf=ismember(vin(strs),'frequency'); + if any(indxf) + [fi xi] = ecdf(data,'frequency',freq); + else + [fi xi] = ecdf(data); + end + %Check unique xi, combine fi + [xi,ign,indx]=unique(xi); %#ok + fi=accumarray(indx,fi); + inds = 1:min([max_num_dist,numel(PD)]); + ys = cellfun(@(PD) cdf(PD,xi),PD(inds),'UniformOutput',0); + ys=cat(2,ys{:}); + subplot(2,1,1) + stairs(xi,[fi ys]); + legend(['empirical',{D(inds).DistName}],'Location','NE') + xlabel('Value'); + ylabel('Cumulative Probability'); + title('Cumulative Distribution Function'); + grid on + subplot(2,1,2) + y = 1.1*bsxfun(@minus,ys,fi); + stairs(xi,bsxfun(@minus,ys,fi)) + ybnds = max(abs(y(:))); + ax = axis; + axis([ax(1:2) -ybnds ybnds]); + legend({D(inds).DistName},'Location','NE') + xlabel('Value'); + ylabel('Error'); + title('CDF Error'); + grid on + end +end + +end diff --git a/distribution_loc.m b/distribution_loc.m new file mode 100644 index 0000000..b994c19 --- /dev/null +++ b/distribution_loc.m @@ -0,0 +1,97 @@ +% pd = makedist('normal', 'mu', 0, 'sigma', 53.8836); % cm + +pd = makedist('tlocationscale', 'mu', 0, 'sigma', 42.8875, 'nu', 5.3095); + +% params +plotflag = 0; +samples = 100000; +farthest_dist = [500, 2000]; % cm +devLocations = [0, 0; + 500, 0; + 500, 2000; + 0, 2000]; + +% setup +for i = size(devLocations, 1):-1:1 + for j = size(devLocations, 1):-1:(i+1) + diff = devLocations(i, :) - devLocations(j, :); + pairwiseDist(i, j) = sqrt(sum(diff.^2)); + end +end +targetLocs = rand(samples, 2) .* farthest_dist; +% targetLocs = [250, 100; 250, 200; 250, 300; 250, 400; 250, 500]; +% samples = size(targetLocs, 1); +trueDist = zeros(size(devLocations, 1), samples); +estDist = zeros(size(devLocations, 1), samples); + +% % estimate measurements +for i = 1:size(devLocations, 1) + trueDist(i, :) = sqrt(sum((targetLocs - devLocations(i, :)).^2, 2)); + if isa(pd, 'prob.NormalDistribution') + randvar_vec = random(pd, size(trueDist(i, :))); + else + signs = (rand(size(trueDist(i, :))) < 0.5) * 2 - 1; + randvar_vec = random(pd, size(trueDist(i, :))) .* signs; + end + estDist(i, :) = trueDist(i, :) + randvar_vec; +end + +% estimate locations +estLocs = zeros(size(targetLocs)); +if plotflag + figure(3); clf; hold on; + scatter(devLocations(:, 1), devLocations(:, 2), 'b*'); +end +for i = 1:samples + points = []; + for j = 1:size(devLocations, 1) + for k = (j+1):size(devLocations, 1) + d = pairwiseDist(j, k); + if (d > estDist(j, i) + estDist(k, i)) ||... + (d < abs(estDist(j, i) - estDist(k, i))) + continue + end + a = (estDist(j, i) * estDist(j, i) -... + estDist(k, i) * estDist(k, i) +... + d * d) / (2 * d); + h = sqrt(estDist(j, i) * estDist(j, i) - a * a); + x0 = devLocations(j, 1) +... + a * (devLocations(k, 1) - devLocations(j, 1)) / d; + y0 = devLocations(j, 2) +... + a * (devLocations(k, 2) - devLocations(j, 2)) / d; + rx = -(devLocations(k, 2) - devLocations(j, 2)) * (h / d); + ry = -(devLocations(k, 1) - devLocations(j, 1)) * (h / d); + points = [points; x0 + rx, y0 - ry; x0 - rx, y0 + ry]; + end + end + if ~isempty(points) && size(devLocations, 1) == 2 + if devLocations(1, 2) == devLocations(2, 2) + points(points(:, 2) < devLocations(1, 2), :) = []; + elseif devLocations(1, 1) == devLocations(2, 1) + points(points(:, 1) < devLocations(1, 1), :) = []; + end + end + estLoc = median(points, 1); + if isempty(estLoc) + estLocs(i, :) = nan; + else + estLocs(i, :) = estLoc; + if plotflag + scatter(targetLocs(i, 1), targetLocs(i, 2), 'ks'); + scatter(estLocs(i, 1), estLocs(i, 2), 'r.'); + pause(0.001) + end + end +end + +logistics = isnan(estLocs(:, 1)); +estLocs(logistics, :) = []; +targetLocs(logistics, :) = []; +fprintf('* %d cannot est locations\n', sum(logistics)); + +err = sqrt(sum((estLocs - targetLocs).^2, 2)); +figure(4); hold on; cdfplot(err); + +fitresult = allfitdist(err, 'pdf'); +% essentially, we are getting a gamma distribution in generall +% the result should be validated in real-life experiments also \ No newline at end of file