StochHMM  v0.34
Flexible Hidden Markov Model C++ Library and Application
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
traceback_path.cpp
Go to the documentation of this file.
1 //traceback_path.cpp
2  //Copyright (c) 2007-2012 Paul C Lott
3  //University of California, Davis
4  //Genome and Biomedical Sciences Facility
5  //UC Davis Genome Center
6  //Ian Korf Lab
7  //Website: www.korflab.ucdavis.edu
8  //Email: lottpaul@gmail.com
9  //
10  //Permission is hereby granted, free of charge, to any person obtaining a copy of
11  //this software and associated documentation files (the "Software"), to deal in
12  //the Software without restriction, including without limitation the rights to
13  //use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
14  //the Software, and to permit persons to whom the Software is furnished to do so,
15  //subject to the following conditions:
16  //
17  //The above copyright notice and this permission notice shall be included in all
18  //copies or substantial portions of the Software.
19  //
20  //THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
21  //IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
22  //FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
23  //COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
24  //IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
25  //CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
26 
27 #include "traceback_path.h"
28 
29 namespace StochHMM{
30 
31 
32  int WID=80;
33  //!Create a traceback_path
34  //!\param modl Pointer to model file
36  hmm=modl;
37  }
38 
39  //!Pushes a state index onto the end of the path
40  //!\param state Index of state to add
42  trace_path.push_back(state);
43  }
44 
45  //!Returns the size (ie. length) of the traceback_path
46  size_t traceback_path::size()const {
47  return trace_path.size();
48  }
49 
50  //!Clears all traceback path information
52  trace_path.clear();
53  }
54 
55 
56  //! Get the path to std::vector<int>
57  //! \param [out] pth std::vector<int> that represents path
58  void traceback_path::path(std::vector<int>& pth){
59  pth=trace_path;
60  return ;
61  }
62 
63  //TODO: change assignment to lhs
64  //! Get the label of the traceback_path and assigns to vector<string> ref
65  void traceback_path::label(std::vector<std::string>& pth){
66 
67  for(size_t k=trace_path.size()-1; k!=SIZE_MAX; k--){
68  state* st = hmm->getState(trace_path[k]);
69  pth.push_back(st->getLabel());
70  }
71  return;
72  }
73 
74  //!Get string of path label traceback
75  //! \param[out] pth std::string
76  void traceback_path::label(std::string& pth){
77 
78  if (pth.size()>0){
79  pth.clear();
80  }
81 
82 
83  for(size_t k=trace_path.size()-1; k!=SIZE_MAX; k--){
84  state* st = hmm->getState(trace_path[k]);
85  pth+=st->getLabel();
86  }
87  return;
88  }
89 
90  //!Get names of traceback path
91  //!\param [out] pth vector<string>
92  void traceback_path::name(std::vector<std::string>& pth){
93 
94  if ( hmm==NULL ){
95  std::cerr << "Model is NULL. traceback::name(...) must have valid HMM model defined.\n";
96  exit(2);
97  }
98 
99  for(size_t k = trace_path.size()-1; k != SIZE_MAX; k--){
100  state* st = hmm->getState(trace_path[k]);
101  pth.push_back(st->getName());
102  }
103  return;
104  }
105 
106 
107  //!Get GFF output of traceback path
108  //!\param [out] pth
109  void traceback_path::gff(std::vector<gff_feature>& pth,std::string& sequenceName){
110  std::string current_label="";
111  long long start=0;
112  size_t path_size=size();
113 
114  if ( hmm==NULL ){
115  std::cerr << "Model is NULL. traceback::gff(...) must have valid HMM model defined.\n";
116  exit(2);
117  }
118 
119  for(size_t k = path_size-1;k != SIZE_MAX; k--){
120  state* st = hmm->getState(trace_path[k]);
121  std::string new_label=st->getGFF();
122  if (new_label.compare("")==0){
123  if (start>0){
124  gff_feature ln;
125  ln.seqname=sequenceName;
126  ln.source=hmm->getName();
127  ln.feature=current_label;
128  ln.start=start;
129  ln.end=path_size-(k+1);
130  ln.score='.';
131  ln.strand='+';
132  ln.frame='.';
133  ln.attribute="";
134 
135  pth.push_back(ln);
136 
137  start=0;
138  current_label=new_label;
139  }
140  else {
141  continue;
142  }
143  }
144  else {
145  if(k==0){
146  gff_feature ln;
147  ln.seqname=sequenceName;
148  ln.source=hmm->getName();
149  ln.feature=current_label;
150  ln.start=start;
151  ln.end=path_size-(k+1);
152  ln.score='.';
153  ln.strand='+';
154  ln.frame='.';
155  ln.attribute="";
156 
157  pth.push_back(ln);
158  }
159  else if (start==0){
160  start=path_size-k;
161  current_label=new_label;
162  }
163  else if (new_label.compare(current_label)==0){
164  continue;
165  }
166  else {
167  gff_feature ln;
168  ln.seqname=sequenceName;
169  ln.source=hmm->getName();
170  ln.feature=current_label;
171  ln.start=start;
172  ln.end=path_size-(k+1);
173  ln.score='.';
174  ln.strand='+';
175  ln.frame='.';
176  ln.attribute="";
177 
178  pth.push_back(ln);
179 
180  start=path_size-k;
181  current_label=new_label;
182  }
183  }
184 
185  }
186 
187  return;
188  }
189 
190 
191  //!Print the path to stdout
193  int line=0;
194  for(size_t k = this->size()-1; k != SIZE_MAX; k--){
195  std::cout << trace_path[k]<< " ";
196  line++;
197  }
198  std::cout << std::endl << std::endl;
199  }
200 
201  //!Print the path to file stream
202  void traceback_path::fprint_path(std::ofstream &file){
203  int line=0;
204  for(size_t k=this->size()-1;k != SIZE_MAX; k--){
205  file << trace_path[k]<< " ";
206  line++;
207  }
208  file << std::endl;
209  }
210 
211  //!Check to see if paths are the same
213  if (rhs.trace_path==trace_path){
214  return true;
215  }
216  else {
217  return false;
218  }
219  }
220 
221  //!Comparison operators for path
223  if (trace_path<rhs.trace_path){
224  return true;
225  }
226  else {
227  return false;
228  }
229  }
230 
231  //!Comparison operators for path
233  if (trace_path>rhs.trace_path){
234  return true;
235  }
236  else {
237  return false;
238  }
239  }
240 
241  //!Comparison operators for path
243  if (trace_path<=rhs.trace_path){
244  return true;
245  }
246  else {
247  return false;
248  }
249  }
250 
251  //!Comparison operators for path
253  if (trace_path>=rhs.trace_path){
254  return true;
255  }
256  else {
257  return false;
258  }
259  }
260 
261 
262  //!Print traceback_path labels to stdout
264  int line=0;
265 
266  if ( hmm==NULL ){
267  std::cerr << "Model is NULL. traceback::print_label() must have valid HMM model defined.\n";
268  exit(2);
269  }
270 
271  for(size_t k = trace_path.size()-1;k != SIZE_MAX;k--){
272 // if(line==WID && WID>0){
273 // std::cout<< std::endl;
274 // line=0;
275 // }
276  state* st = hmm->getState(trace_path[k]);
277  std::cout << st->getLabel() << " ";
278  line++;
279  }
280  std::cout << std::endl << std::endl;
281 
282  }
283 
284  //!Outputs the gff formatted output for the traceback to stdout
285  void traceback_path::print_gff(std::string sequence_name, double score, int ranking, int times, double posterior) const {
286  std::string current_label="";
287  long long start=0;
288  size_t path_size=this->size();
289 
290  if ( hmm==NULL ){
291  std::cerr << "Model is NULL. traceback::print_gff(...) must have valid HMM model defined.\n";
292  exit(2);
293  }
294 
295  for(size_t k=path_size-1;k != SIZE_MAX;k--){
296  state* st = hmm->getState(trace_path[k]);
297  std::string new_label=st->getGFF();
298  if (new_label.compare("")==0){
299  if (start>0){
300  std::cout << sequence_name << "\tStochHMM\t" << current_label <<"\t"<< start << "\t" << path_size-(k+1) << "\t" << score << "\t+\t.\tRank:"<<ranking<<",Counts:" << times<<",Posterior:"<<posterior<<std::endl;
301  start=0;
302  current_label=new_label;
303  }
304  else {
305  continue;
306  }
307  }
308  else {
309  if(k==0){
310  std::cout << sequence_name << "\tStochHMM\t" << current_label <<"\t"<< start << "\t" << path_size << "\t" << score << "\t+\t.\tRank:"<<ranking<<",Counts:" << times<<",Posterior:"<<posterior<<std::endl;
311  }
312  else if (start==0){
313  start=path_size-k;
314  current_label=new_label;
315  }
316  else if (new_label.compare(current_label)==0){
317  continue;
318  }
319  else {
320  std::cout << sequence_name << "\tStochHMM\t" << current_label <<"\t"<< start << "\t" << path_size-(k+1) << "\t" << score << "\t+\t.\tRank:"<<ranking<<",Counts:" << times<<",Posterior:"<<posterior<<std::endl;
321  start=path_size-k;
322  current_label=new_label;
323  }
324  }
325 
326  }
327  }
328 
329 
330  //!outputs the gff formatted output for the traceback
331  void traceback_path::print_gff(std::string sequence_name) const {
332  std::string current_label="";
333  long long start=0;
334  size_t path_size=size();
335 
336  if (sequence_name[0] == '>'){
337  sequence_name = sequence_name.substr(1);
338 
339  if ( hmm==NULL ){
340  std::cerr << "Model is NULL. traceback::print_gff(...) must have valid HMM model defined.\n";
341  exit(2);
342  }
343  }
344 
345  for(size_t k = path_size-1; k != SIZE_MAX; k--){
346  state* st = hmm->getState(trace_path[k]);
347  std::string new_label=st->getGFF();
348 
349  //If no label then print
350  if (new_label.compare("")==0){
351  if (start>0){
352  std::cout << sequence_name << "\tStochHMM\t" << current_label <<"\t"<< start << "\t" << path_size-(k+1) << "\t.\t+\t."<<std::endl;
353  start=0;
354  current_label=new_label;
355  }
356  else {
357  continue;
358  }
359  }
360  else {
361  if (start==0){
362  start=path_size-k;
363  current_label=new_label;
364  }
365  else if (new_label.compare(current_label)==0){
366  if(k==0){
367  std::cout << sequence_name << "\tStochHMM\t" << current_label <<"\t"<< start << "\t" << path_size << "\t.\t+\t."<<std::endl;
368 
369  }
370 
371  continue;
372  }
373  else {
374  std::cout << sequence_name << "\tStochHMM\t" << current_label <<"\t"<< start << "\t" << path_size-(k+1) << "\t.\t+\t."<<std::endl;
375 
376  start=path_size-k;
377  current_label=new_label;
378 
379  if(k==0){
380  std::cout << sequence_name << "\tStochHMM\t" << current_label <<"\t"<< start << "\t" << path_size << "\t.\t+\t."<<std::endl;
381  }
382 
383  }
384  }
385 
386  }
387 
388  std::cout << std::endl<<std::endl;
389  }
390 
391 
392  /* Need to re-write to handle new HMM Types
393  //fix to handle higher order model
394  double traceback_path::path_prob (const HMM &model){
395  int size= trace_path.size();
396 
397  int seq_size= model.seq_size();
398  int alpha_size=model.alpha.size();
399 
400 
401  if (size!=seq_size){
402  cerr << "Sequence and Path different sizes\n";
403  exit(1);
404  }
405 
406  vector<vector<double> > log_emm=convert_order(model, trace_path[size-1], 0);
407  double prob=model.initial.get_trans(trace_path[size-1]) + log_emm[0][seq.val(0)];
408 
409 
410  for (unsigned int i=1;i<size;i++){
411  int index=0;
412 
413  if (model.states[trace_path[size-i-1]].order>i){
414  vector<vector<double> > emmiss=convert_order(model, trace_path[size-i-1], i);
415  for(int n=i;n>=1;n--){
416  index+=POWER[n-1][alpha_size-1]*seq.val(i-n);
417  }
418 
419  //cout <<i<<"\t"<<prob <<endl;
420  prob+=model.states[trace_path[size-i]].get_trans(trace_path[size-i-1])+emmiss[index][seq.val(i)];
421  }
422 
423 
424  else{
425 
426  for(int n=model.states[trace_path[size-i-1]].order;n>=1;n--){
427  index+=POWER[n-1][alpha_size-1]*seq.val(i-n);
428  //cout <<"Index\t"<< index<<"\t"<< k << endl;
429  }
430  //cout <<i<<"\t"<<prob <<endl;
431  prob+=model.states[trace_path[size-i]].get_trans(trace_path[size-i-1])+model.states[trace_path[size-i-1]].log_emm[index][seq.val(i)];
432  }
433  }
434 
435  return prob;
436  }
437  */
438 
439 
440  //!Create multiTraceback()
442  maxSize=0;
443  vectorIterator=0;
444  table=NULL;
445  }
446 
447  //!Destroy multiTraceback
449  if(table!=NULL){
450  delete table;
451  }
452  }
453 
454  //!Set position in multiTraceback to the beginning
456  vectorIterator=0;
457  return;
458  }
459 
460  //!Set position in multiTraceback to the ending
462  vectorIterator=paths.size();
463  return;
464  }
465 
466  //!Increment the iterator to next position
468  if (vectorIterator<maxSize){
469  vectorIterator++;
470  }
471  return;
472  }
473 
474  //!Decrement the iterator to previous position
476  if (vectorIterator>0){
477  vectorIterator--;
478  }
479  return;
480  }
481 
482  //!Set iterator to index val
483  //!\param val Index value to set
484  void multiTraceback::operator=(size_t val){
485  if (val<=maxSize){
486  vectorIterator=val;
487  }
488  return;
489  }
490 
491 
492  //!Get traceback_path at index position
493  //! \param val Index position
495  return (*pathAccess[val]).first;
496  }
497 
498 
499  //!Get traceback_path at currently set index in multiTraceback
501  return (*pathAccess[vectorIterator]).first;
502  }
503 
504  //!Get the number times that traceback_path was recorded in multiple traceback
506  return (*pathAccess[vectorIterator]).second;
507  }
508 
509 
510  //!Add traceback_path to multiTraceback
511  //!\param path Traceback path to add
513  paths[path]++;
514  return;
515  }
516 
517 
518  //!Sorts the multiTraceback by the number of time a particular tracback path occurred
520  maxSize=paths.size();
521  vectorIterator=0;
522 
523  std::map<traceback_path,int>::iterator pathsIterator;
524  for(pathsIterator=paths.begin();pathsIterator!=paths.end();pathsIterator++){
525  pathAccess.push_back(pathsIterator);
526  }
527 
528  sort(pathAccess.begin(),pathAccess.end(),sortTBVec);
529  return;
530  }
531 
532  //!Generate a hit table from a multiple traceback paths
533  //! Hit table is 2D table describing how many times a state was called at a particular position in the sequence
535  if (table!=NULL){
536  delete table;
537  }
538 
539  //Over the lenght of the sequence
540  model* hmm = ((*pathAccess[0]).first).getModel();
541  size_t sequenceSize=((*pathAccess[0]).first).size();
542  size_t stateSize=hmm->state_size();
543 
544 
545  std::vector<int> states(stateSize,0);
546  table = new heatTable(sequenceSize,states);
547 
548  std::map<traceback_path,int>::iterator it;
549 
550  for( it =paths.begin(); it!=paths.end();it++){
551  int count = (*it).second;
552  for(size_t position=0;position<sequenceSize;position++){
553  int tbState=(*it).first[position];
554  (*table)[position][tbState]+=count;
555  }
556  }
557  return table;
558  }
559 
560 
562  if (table==NULL){
563  get_hit_table();
564  }
565 
566  std::string header_row = "Position";
567  model* hmm = ((*pathAccess[0]).first).getModel();
568  for (size_t state_iter =0; state_iter<hmm->state_size(); state_iter++){
569  header_row+="\t";
570  header_row+=hmm->getStateName(state_iter);
571  }
572 
573  std::cout << header_row << std::endl;
574 
575  for(size_t position = 0; position < table->size(); position++){
576  std::string line = join((*table)[position], '\t');
577  std::cout << position+1 << "\t" << line << std::endl;
578  }
579 
580  return;
581  }
582 
583 
585  this->finalize();
586  for(size_t iter=0; iter<this->size(); iter++){
587  std::cout << "Traceback occurred:\t " << (*pathAccess[iter]).second << std::endl;
588  (*pathAccess[iter]).first.print_path();
589  std::cout << std::endl;
590  }
591  return;
592  }
593 
595  this->finalize();
596  for(size_t iter=0; iter<this->size(); iter++){
597  std::cout << "Traceback occurred:\t " << (*pathAccess[iter]).second << std::endl;
598  (*pathAccess[iter]).first.print_label();
599  std::cout << std::endl;
600  }
601  return;
602  }
603 
604  void multiTraceback::print_gff(std::string& header){
605  this->finalize();
606  for(size_t iter=0; iter<this->size(); iter++){
607  std::cout << "Traceback occurred:\t " << (*pathAccess[iter]).second << std::endl;
608  (*pathAccess[iter]).first.print_gff(header);
609  std::cout << std::endl;
610  }
611  return;
612  }
613 
614 
615  bool sortTBVec(std::map<traceback_path,int>::iterator lhs, std::map<traceback_path,int>::iterator rhs)
616  {
617  return ((*lhs).second < (*rhs).second);
618  }
619 
620 }