StochHMM  v0.34
Flexible Hidden Markov Model C++ Library and Application
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
statistics.cpp
Go to the documentation of this file.
1 //statistics.cpp
2 //Copyright (c) 2007-2012 Paul C Lott
3 //University of California, Davis
4 //Genome and Biomedical Sciences Facility
5 //UC Davis Genome Center
6 //Ian Korf Lab
7 //Website: www.korflab.ucdavis.edu
8 //Email: lottpaul@gmail.com
9 //
10 //Permission is hereby granted, free of charge, to any person obtaining a copy of
11 //this software and associated documentation files (the "Software"), to deal in
12 //the Software without restriction, including without limitation the rights to
13 //use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
14 //the Software, and to permit persons to whom the Software is furnished to do so,
15 //subject to the following conditions:
16 //
17 //The above copyright notice and this permission notice shall be included in all
18 //copies or substantial portions of the Software.
19 //
20 //THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
21 //IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
22 //FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
23 //COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
24 //IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
25 //CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
26 
27 #include "statistics.h"
28 namespace StochHMM{
29 
30  template <class T>
31  T min(std::vector<T> &set){
32  T min =set[0];
33  for(long i=set.size()-1;i>0;i--){
34  if (set[i]<min){ min=set[i];}
35  }
36  return min;
37  }
38 
39  template <class T>
40  T max(std::vector<T> &set){
41  T max=set[0];
42  for(long i=set.size()-1;i>0;i--){
43  if (set[i]>max){ max=set[i];}
44  }
45  return min;
46  }
47 
48  template <class T>
49  T construct_histogram (std::vector<T> &set,int N_bins){
50  T mini=min(set);
51  T maxi=max(set);
52  T delta=(maxi-mini+1)/N_bins;
53  std::vector<T> bin (N_bins,0);
54  for(int i=set.size()-1;i>=0;i--){
55  T value=floor((set[i]-mini)/delta);
56  bin(value)=bin(value)+1/(set.size()*delta);
57  }
58  return bin;
59  }
60 
61  template <class T>
62  T smooth_histogram (std::vector<T> histo, int intervals, int window_size, int iterations){
63  std::vector<T> s_histo=histo;
64  for (int i=1;i<=iterations;i++){
65  for (int b=0;b<=intervals-window_size;b++){
66  int c=b+floor((window_size-1)/2);
67  s_histo[c]=0;
68  for (int j=b;j<=b+window_size-1;i++){
69  s_histo[c]=s_histo[c]+histo[j]/window_size;
70  }
71  }
72  for (int b=0;b<=((window_size-1)/2)-1;b++){
73  s_histo=s_histo[floor((window_size-1)/2)];
74  }
75  for (int b=intervals-window_size+1+floor((window_size-1)/2); b<=intervals-1;i++){
76  s_histo[b]=s_histo[intervals-window_size+floor((window_size-1) / 2)];
77  }
78  histo=s_histo;
79  }
80  T sum=0;
81  for (int b=0;b<=intervals-1;b++){
82  sum+=sum+s_histo[b];
83  }
84  for (int b=0;b<=intervals-1;b++){
85  s_histo[b]/=sum;
86  }
87  return s_histo;
88  }
89 
90  float entropy (std::vector<float> &set){
91  float entropy=0;
92  for(size_t i=0;i<set.size();i++){
93  entropy+=set[i]*(log(set[i])/log(2));
94  }
95  return entropy*-1;
96  }
97 
98  float rel_entropy (std::vector<float> &set, std::vector<float> &set2){
99  float rel_entropy=0;
100  if (set.size()!=set2.size()){
101  std::cerr << "Distributions aren't the same size";
102  }
103 
104  for(size_t i=0;i<set.size();i++){
105  rel_entropy+=0.5*(set[i]* (log (set[i]/set2[i]) /log(2) )+set2[i]*(log(set2[i]/set[i])/log(2)));
106  }
107  return rel_entropy;
108  }
109 
110 
111  /////////////////////////////////////////// Integration & Summation //////////////////////////////////////////////
112  //Need to adapt to take up to three paramenter and pass them to the function.
113  //If one parameter is given then only pass one to function and so on.
114 
115  //Fix: variable handed to function to include only std::vector<double>
116  double _integrate (double (*funct)(double, std::vector<double>),double upper, double lower,std::vector<double> param){
117  double mid=(lower+upper)/2.0;
118  double h3=fabs(upper-lower)/6.0;
119  return h3*(funct(lower,param)+4*funct(mid,param)+funct(upper,param));
120  }
121 
122  double integrate (double (*funct)(double,std::vector<double>),double lower, double upper ,std::vector<double> param, double max_error=0.001, double sum=0){
123  double mid=(upper+lower)/2.0;
124  double left=_integrate(funct,lower, mid, param);
125  double right=_integrate(funct,mid,upper, param);
126  if (fabs(left+right-sum)<=15*max_error){
127  return left+right +(left+right-sum)/15;
128  }
129  return integrate(funct,lower,mid,param,max_error/2,left) + integrate(funct,mid,upper,param,max_error/2,right);
130  }
131 
132 
133 
134  // Adaptive Simpson's numerical integration method
135  double simpson (double (*funct)(double,double,double),double alpha, double beta, double lower, double upper){
136  double mid=(lower+upper)/2.0;
137  double h3=fabs(upper-lower)/6.0;
138  return h3*(funct(lower,alpha,beta)+4*funct(mid,alpha,beta)+funct(upper,alpha,beta));
139  }
140 
141  double adapt_simpson (double (*funct)(double,double,double),double alpha, double beta, double lower, double upper, double max_error, double sum){
142  double mid=(upper+lower)/2.0;
143  double left=simpson(funct,alpha,beta,lower, mid);
144 
145  double right=simpson(funct,alpha,beta,mid,upper);
146  if (fabs(left+right-sum)<=15*max_error){
147  return left+right +(left+right-sum)/15;
148  }
149  return adapt_simpson(funct,alpha,beta,lower,mid,max_error/2,left) + adapt_simpson(funct,alpha,beta,mid,upper,max_error/2,right);
150  }
151 
152 
153  /////////////////////////////////////// Distributions //////////////////////////////////////////////
154 
155 
156  /* ---------------- Discrete Distributions ---------------- */
157  /* ------------------- FINITE INTERVALS ------------------- */
158 
159  //!Discrete Uniform CDF
160  //!param a Minimum position
161  //!param b Maximum position
162  //!param position Position to calculate
163  float discrete_uniform_cdf(int a, int b, int position){
164  if (position<a){
165  return 0;
166  }
167  else if (position>=b){
168  return 1;
169  }
170  else{
171  return (float)(position-a+1)/(b-a+1);
172  }
173  }
174 
175 
176 
177  //!Binomial coefficient
178  //!number of combinations possible given k unordered outcomes from n possibilities
179  //!param n number of items
180  //!param k number in set
181  float bin_coef (int n, int k){
182  float c=0;
183  for(int i=k+1;i<=n;i++) {c+=log(i);}
184  for(int j=1;j<=(n-k);j++) {c-=log(j);}
185  return exp(c);
186  }
187 
188  //!Binomial probability mass function
189  //!param k Number of successes
190  //!param n Number of trials
191  //!param p Probability of success
192  float binomial_pdf(int k,int n, float p){
193  return bin_coef(n,k)*pow(p,k)*pow(1-p,n-k);
194  }
195 
196  //!Binomial probaility mass function
197  //!param k Number of successes
198  //!param param std::vector<float> (n=number of trials, p=probability of success)
199  float binomial_pdf(int k, std::vector<float> param){
200  return binomial_pdf(k, param[0],param[1]);
201  }
202 
203 
204  //!Binomial Cumulative Distribution Function
205  //!param k Number of successes
206  //!param n Number of trials
207  //!param p Probability of success
208  //!If we can get an incomplete beta function. We can update this function and
209  //calculate without using the summation.
210  float binomial_cdf(int k,int n, float p){
211  std::vector<float> param (2,0.0);
212  param[0]=n;
213  param[1]=p;
214  return summation(binomial_pdf,0,k,param);
215  }
216 
217 
218 
219  //!Hypergeometric Cumulative Distribution Function
220  //!param n Number of draws from Population
221  //!param N Size of population
222  //!param m Number of successes in Population
223  //!param k Number of successes
224  double hypergeometric_cdf(int n, int N, int m, int k){
225  if (n==N){
226  return 1.0;
227  }
228  else{
229  double prob=0;
230  for(int i=1; i<=n;i++){
231  prob+=(bin_coef(m, k)*bin_coef(N-m,i-k))/bin_coef(N, i);
232  }
233  return prob;
234  }
235  }
236 
237 
238 
239  //Wikipedia Windschitl approximation to gamma function
240  double gamma_func (double x){
241  if (x>0 && x<=0.5){
242  return sin(PI*x)*gamma_func(1-x); }
243  else {
244  return sqrt((2*PI)/x)*pow(((x/exp(1))*sqrt(x*sinh(1/x)+1/810*pow(x,6))),x);
245  }
246  }
247 
248  double gamma_pdf (double x, double alpha, double beta) {
249  if (x>0){
250  return (pow(beta,alpha)/gamma_func(alpha))*pow(x,alpha-1)*exp(-beta*x);
251  }
252  else {
253  return 0;
254  }
255  }
256 
257  double gamma_pdf (double x, std::vector<double> param){ //double x, double alpha, double beta) {
258  return gamma_pdf(x,param[0],param[1]);
259  }
260 
261  double gamma_cdf (double x, double alpha, double beta){
262  std::vector<double> parameters (2,0.0);
263  parameters[0]=alpha;
264  parameters[1]=beta;
265  return integrate(gamma_pdf,0.0,x,parameters);
266  }
267 
268  double chi2_pdf(double x, double df){
269  return gamma_pdf(x,df/2.0,0.5);
270  }
271 
272  double chi2_cdf (double x, double df){
273  return gamma_cdf(x,df/2.0,0.5);
274  }
275 
276  double beta_pdf(double x, double alpha, double beta){
277  return (gamma_func(alpha+beta)/(gamma_func(alpha)*gamma_func(beta)))*pow(x,alpha-1)*pow(1-x,beta-1);
278  }
279 
280  double beta_pdf(double x, std::vector<double> param){
281  return beta_pdf(x,param[0],param[1]);
282  }
283 
284  double beta_cdf(double x, double alpha, double beta){
285  std::vector<double> parameters (2,0.0);
286  parameters[0]=alpha;
287  parameters[1]=beta;
288  return integrate(beta_pdf, 0.0, x,parameters);
289  }
290 
291  double expon_pdf(double x, double beta){
292  return gamma_pdf(x,1.0,beta);
293  }
294 
295  double expon_cdf(double x, double beta){
296  return gamma_cdf(x,1.0,beta);
297  }
298 
299  double normal_pdf(double x, double mean, double variance){
300  return (1/(sqrt(2*PI*variance)))*exp(-1*pow(x-mean,2)/(2*variance));
301  }
302 
303 
304 
305 
306 
307 
308 
309  float summation (float (*funct)(int,std::vector<float>), int lower, int upper, std::vector<float> param){
310  float sum=0;
311  for(int i=lower;i<=upper;i++){
312  sum+=funct(i,param);
313  }
314  return sum;
315  }
316 
317 
318 
319 
320 
321 
322  double multinomial_pdf(std::vector<int> r, int n, std::vector<double> p){
323  double prob=1.0;
324  double denom=1.0;
325  if (r.size()==p.size()){return 0.0;}
326  else {
327 
328  for(int i=0;i<r.size();i++){
329  prob*=p[i];
330  denom*=gamma_func(r[i]);
331  }
332  }
333 
334  return gamma_func((n+1)/denom)*prob; ///check to make sure this is right;
335  }
336 
337  //////////////////////// Need to fix everything below this point ///////////////////////////////////
338  double _low_igamma (double x, std::vector<double> param){
339  double value=param[0];
340  return (pow(x,value-1)*exp(-x));
341  }
342 
343  double low_igamma (double x, double alpha){
344  std::vector<double> parameters (2,0.0);
345  parameters[0]=alpha;
346  return (integrate(_low_igamma,0.0,x,parameters));
347  }
348 
349  double upper_igamma (double x , double alpha){
350  return (1/gamma_func(alpha))-low_igamma(x,alpha);
351  }
352 
353  double erf (double x) {
354  return 1-(upper_igamma(0.5,pow(x,2))/sqrt(2));
355  }
356 
357  double std_normal_cdf (double x){
358  x/=sqrt(2);
359  double erf_value=erf(x);
360  return 0.5*(1.0+erf_value);
361  }
362 }