StochHMM  v0.34
Flexible Hidden Markov Model C++ Library and Application
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
transitions.cpp
Go to the documentation of this file.
1 //transitions.cpp
2  //Copyright (c) 2007-2012 Paul C Lott
3  //University of California, Davis
4  //Genome and Biomedical Sciences Facility
5  //UC Davis Genome Center
6  //Ian Korf Lab
7  //Website: www.korflab.ucdavis.edu
8  //Email: lottpaul@gmail.com
9  //
10  //Permission is hereby granted, free of charge, to any person obtaining a copy of
11  //this software and associated documentation files (the "Software"), to deal in
12  //the Software without restriction, including without limitation the rights to
13  //use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
14  //the Software, and to permit persons to whom the Software is furnished to do so,
15  //subject to the following conditions:
16  //
17  //The above copyright notice and this permission notice shall be included in all
18  //copies or substantial portions of the Software.
19  //
20  //THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
21  //IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
22  //FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
23  //COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
24  //IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
25  //CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
26 
27 #include "transitions.h"
28 namespace StochHMM{
29 
30  //!Create a transition of a certain type
31  //! \param type enum transType (STANDARD, DURATION, LEXICAL)
33  transition_type=type;
35  log_trans=-INFINITY;
36  extendedValue=-INFINITY;
37  function=false;
38  func = NULL;
39  lexFunc = NULL;
40  toState = NULL;
41  func = NULL;
42  }
43 
44  //!Create a transition of a certain type
45  //! \param type Type of transType
46  //! \param valType Type of Value Transition uses
47  //! \param Is distributions defined as survival
48  transition::transition(transType type, valueType valtyp, bool survival){
49  transition_type=type;
51  log_trans = -INFINITY;
52  func=NULL;
53  lexFunc=NULL;
54  function=false;
55  toState=NULL;
56  func = NULL;
57  }
58 
59  //! Parse the transition User and Standard Transition from String
60  //! to create a transition for the state
61  //! \param txt String representation of the transition
62  //! \param names StringList of all the states
63  //! \param valtyp Value type used (log, p(x)...)
64  //! \param trks Tracks of the model
65  //! \param wts Weight defined in the model
66  //! \param funcs State Functions created by the user
67  bool transition::parse(std::string& txt,stringList& names, valueType valtyp, tracks& trks, weights* wts , StateFuncs* funcs){
68 
69  if (!_processTags(txt,trks, wts, funcs)){
70  std::cerr << "Couldn't properly process tag from: "<< txt << std::endl;
71  return false;
72  }
73 
74  if (transition_type == STANDARD){
75  if (!_parseStandard(txt,names, valtyp)){
76  std::cerr << "Couldn't parse the Standard Transition" << std::endl;
77  }
78  }
79 
80  return true;
81  }
82 
83  //!Parse the Transition Lexical and User from String List
84  //! \param txt String representation of the transition used to create the transition from model file
85  //! \param names StringList of all the states
86  //! \param valtyp Value type used (log, p(x)...)
87  //! \param trks Tracks of the model
88  //! \param wts Weight defined in the model
89  //! \param funcs State Functions created by the user
90  bool transition::parse(stringList& txt,stringList& names, valueType valtyp, tracks& trks, weights* wts , StateFuncs* funcs){
91 
92  //txt.print();
93 
94  if (!_processTags(txt[1],trks, wts, funcs)){
95  std::cerr << "Couldn't properly process tag from: "<< txt[1] << std::endl;
96  return false;
97  }
98 
99  if (transition_type == DURATION ){
100  if (!_parseDuration(txt, names, valtyp)){
101  std::cerr << "Couldn't parse Duration Transition" <<std::endl;
102  return false;
103  }
104  }
105  else if (transition_type == LEXICAL){
106  if (!_parseLexical(txt, names, valtyp, trks, funcs)){
107  std::cerr << "Couldn't parse Lexical Transition" << std::endl;
108  return false;
109  }
110  }
111 
112  return true;
113  }
114 
115 
116  //TODO: Check for errors within TAG information
117 
118  //Process the addition tags after the transition
119  //Tags may describe functions or weights to apply to the transition
120  //Can also describe how a traceback is to be done and processed
121  bool transition::_processTags(std::string& txt, tracks& trks, weights* wts, StateFuncs* funcs){
122  stringList lst = extractTag(txt);
123 
124  if (lst.size() == 0){
125  return true;
126  }
127 
128  func=new(std::nothrow) transitionFuncParam();
129 
130  if (func==NULL){
131  std::cerr << "OUT OF MEMORY\nFile" << __FILE__ << "Line:\t"<< __LINE__ << std::endl;
132  exit(1);
133  }
134 
135  if (!func->parse(lst,trks, wts,funcs)){
136  std::cerr << "Couldn't parse Transition Tag information" << std::endl;
137  return false;
138  }
139 
140 
141  std::string trackName = func->getTrackName();
142 
143  //Does it contain the name
144 
145  return true;
146  }
147 
148  //Parse a simple standard transition from a model file
149  bool transition::_parseStandard(std::string& txt, stringList& names, valueType valtyp){
150 
151  //Process Transition
152  stringList line;
153  line.splitString(txt,"\t:");
154 
155  if (line.size()<2){
156  std::cerr << "Line should contain 2 values (STATE VALUE). Couldn't parse:\n" << txt << std::endl;
157  return false;
158  }
159 
160  stateName= line[0];
161 
162 
163  if (!names.contains(stateName) && stateName!="END"){
164  std::cerr << "Tried to create a transition in the model to state named : " << stateName << " but there is no state with that name. Please check the formatting. \n";
165  return false;
166  }
167 
168  if(valtyp==PROBABILITY){
169  double tempValue;
170 
171  if (!stringToDouble(line[1], tempValue)){
172  std::cerr << "Probability value not numeric: " << line[1] << std::endl;
173  return false;
174  }
175 
176  log_trans = log(tempValue);
177  }
178  else if (valtyp == LOG_PROB){
179 
180  double tempValue;
181 
182  if (!stringToDouble(line[1], tempValue)){
183  std::cerr << "Log Probability value not numeric: " << line[1] << std::endl;
184  return false;
185  }
186 
187  log_trans = tempValue;
188 
189  }
190 
191 
192  return true;
193  }
194 
195 
196 
197  //Parse the user-defined distribution from the model file
199  stringList line;
200  line.splitString(txt[1],"\t:,");
201 
202  if (line.size()<1){
203  std::cerr << "Couldn't parse the transition STATE information: "<< txt[1] << std::endl;
204  }
205 
206  stateName = line[0];
207 
208  //Determine Traceback options
209  if (line.contains("TO_LABEL") || line.contains("TB->LABEL")){
210  size_t idx= (line.contains("TO_LABEL")) ? line.indexOf("TO_LABEL") : line.indexOf("TB->LABEL");
211  idx++;
213  traceback_string = line[idx];
214  }
215  else if (line.contains("TO_GFF") || line.contains("TB->GFF")){
216  size_t idx= (line.contains("TO_GFF")) ? line.indexOf("TO_GFF") : line.indexOf("TB->GFF");
217  idx++;
219  traceback_string = line[idx];
220  }
221  else if (line.contains("TO_STATE") || line.contains("TB->STATE")){
222  size_t idx= (line.contains("TO_STATE")) ? line.indexOf("TO_STATE") : line.indexOf("TB->STATE");
223  idx++;
225  traceback_string = line[idx];
226  }
227  else if (line.contains("TO_START") || line.contains("TB->START")){
229  }
230  else{
232  }
233 
234  //Process Distribution
235  //from line 2 to end is distribution
236  distribution = new(std::nothrow) std::vector<double>;
237 
238  if (distribution==NULL){
239  std::cerr << "OUT OF MEMORY\nFile" << __FILE__ << "Line:\t"<< __LINE__ << std::endl;
240  exit(1);
241  }
242 
243  for(size_t j=2;j<txt.size();j++){
244 
245  stringList ln;
246  ln.splitString(txt[j],"\t");
247  if (ln.size()!=2){
248  std::cerr << "More than 2 values on the line: " << ln.stringify() << std::endl;
249  return false;
250  }
251 
252  size_t index(0);
253  if (!stringToInt(ln[0], index)){
254  std::cerr << "Distribution position value not numeric:\t" << ln[0] << std::endl;
255  return false;
256  }
257 
258  double value;
259  if (!stringToDouble(ln[1], value)){
260  std::cerr << "Distribution position value not numeric:\t" << ln[0] << std::endl;
261  return false;
262  }
263 
264  if (valtyp==PROBABILITY){
265  value=log(value);
266  }
267 
268 
269  //Fill the lower range up to index with the first value
270  //Extend values to lower
271  while (distribution->size()< index-1){
272  distribution->push_back(value);
273  }
274  #ifdef DEBUG_MODEL
275  //cout << value <<endl;
276  #endif
277  distribution->push_back(value);
278  }
279 
280  extendedValue = distribution->back();
281  return true;
282 
283  }
284 
285  //Parse the lexical transition from the model file
286  bool transition::_parseLexical(stringList& txt, stringList& names, valueType valtyp, tracks& trks, StateFuncs* funcs){
287 
288  //Process Transition
289  stringList line;
290  size_t idx(0);
291  std::string functionName("");
292 
293  if (txt.contains("FUNCTION")){
294  idx=txt.indexOf("FUNCTION");
295  function=true;
296 
297  if (idx+1 < txt.size()){
298  functionName=txt[idx+1];
299  }
300  else{
301  std::cerr << "Couldn't parse the function name from the Lexical Transition:\n" << txt.stringify() << std::endl;
302  }
303 
304  }
305 
306 
307  idx=txt.indexOf("TRANSITION");
308 
309  if (idx+1 < txt.size()){
310  idx++;
311  line.splitString(txt[idx],"\t:,");
312 
313  if (line.size()>0){
314  stateName = line[0];
315  }
316  else{
317  std::cerr << "Couldn't parse the STATE for the Lexical transition:\n" << txt.stringify() << std::endl;
318  }
319  }
320  else{
321  std::cerr << "Couldn't parse the Lexical Transition:\n" << txt.stringify() << std::endl;
322  return false;
323  }
324 
325 
326  if (!names.contains(stateName)){
327  std::cerr << "Lexical transition defined transition to state named : " << stateName << " However, there doesn't appear to be any state with that name\n";
328  return false;
329  }
330 
331  //remaining tracks and Orders then set Track
332  std::vector<track*> tempTracks;
333  for(size_t i=1;i<line.size();i++){
334  track* tk = trks.getTrack(line[i]);
335  if (tk==NULL){
336  std::cerr << "Lexical Transition tried to add a track named: " << line[i] << " . However, there isn't a matching track in the model. Please check to model formatting.\n";
337  return false;
338  }
339  else{
340  tempTracks.push_back(tk);
341  }
342  }
343 
344 
345  if (function){ //Lexical Function
346  lexFunc=new(std::nothrow) emissionFuncParam(functionName,funcs,tempTracks[0]);
347 
348  if (lexFunc==NULL){
349  std::cerr << "OUT OF MEMORY\nFile" << __FILE__ << "Line:\t"<< __LINE__ << std::endl;
350  exit(1);
351  }
352  }
353 
354  else{ //Lexical Tables
355  //Get Order Information
356  std::vector<int> tempOrder;
357  if (txt.contains("ORDER")){
358  idx=txt.indexOf("ORDER");
359  }
360  else{
361  std::cerr << "Unable to locate ORDER: for lexical transition to State Name: " << stateName << std::endl;
362  return false;
363  }
364  line.splitString(txt[idx],"\t:,");
365 
366  size_t ambIdx(0);
367  bool containsAmbig=line.contains("AMBIGUOUS");
368  if (containsAmbig){
369  ambIdx=line.indexOf("AMBIGUOUS");
370 
371  for(size_t i=1;i<ambIdx;i++){
372 
373  int tempInt;
374  if (!stringToInt(line[i], tempInt)){
375  std::cerr << "Lexical Transition Order is not numeric" << line[i] << std::endl;
376  return false;
377  }
378 
379  tempOrder.push_back(tempInt);
380  }
381  }
382  else{
383  for(size_t i=1;i<line.size();i++){
384 
385  int tempInt;
386  if (!stringToInt(line[i], tempInt)){
387  std::cerr << "Lexical Transition Order is not numeric" << line[i] << std::endl;
388  return false;
389  }
390 
391  tempOrder.push_back(tempInt);
392  }
393  }
394 
395 
396 
397  if (tempOrder.size() == tempTracks.size()){
398  for(size_t i=0;i<tempOrder.size();i++){
399  scoreTable.addTrack(tempTracks[i], tempOrder[i]);
400  }
401  }
402  else{
403  std::cerr << "Different number of tracks and orders parsed in LEXICAL TRANSITION to state: " << stateName << " Check the formatting of the Lexical Transition" << std::endl;
404  return false;
405  }
406 
407  //Parse Ambiguous Tag Info
408  if (containsAmbig){
409  ambIdx++;
410  if (line.size()<=ambIdx){
411  std::cerr << "No scoring type after AMBIGUOUS label\nAssuming AVG\n";
413  }
414  else if (line[ambIdx].compare("AVG")==0){scoreTable.setUnkScoreType(AVERAGE_SCORE);}
415  else if (line[ambIdx].compare("MAX")==0){scoreTable.setUnkScoreType(HIGHEST_SCORE);}
416  else if (line[ambIdx].compare("MIN")==0){scoreTable.setUnkScoreType(LOWEST_SCORE);}
417  else if (line[ambIdx].compare("P(X)")==0)
418  {
420  ambIdx++;
421 
422  double tempValue;
423  if (!stringToDouble(line[ambIdx], tempValue)){
424  std::cerr << "Ambiguous Value couldn't be parsed: "<< line[ambIdx] << std::endl;
425  return false;
426  }
427 
428  scoreTable.setUnkScore(log(tempValue));
429  }
430  else if (line[ambIdx].compare("LOG")==0){
432  ambIdx++;
433 
434  double tempValue;
435  if (!stringToDouble(line[ambIdx], tempValue)){
436  std::cerr << "Ambiguous Value couldn't be parsed: "<< line[ambIdx] << std::endl;
437  return false;
438  }
439 
440  scoreTable.setUnkScore(tempValue);
441  }
442  }
443 
444 
445 
446  //Get Tables
447  size_t expectedColumns(1);
448  size_t expectedRows(1);
449  for(size_t i = 0; i<scoreTable.getNumberOfAlphabets(); i++){
450  expectedColumns*=scoreTable.getAlphaSize(i);
451  expectedRows*=POWER[scoreTable.getOrder(i)][scoreTable.getAlphaSize(i)-1];
452  }
453 
454  std::vector<std::vector<double> >* log_prob = scoreTable.getLogProbabilityTable();
455  std::vector<std::vector<double> >* prob = scoreTable.getProbabilityTable();
456  std::vector<std::vector<double> >* counts = scoreTable.getCountsTable();
457 
458 
459  idx++;
460 
461  for (size_t iter = idx; iter< txt.size();iter++){
462 
463  //If it's the first line check for a '#' indicating that the column header is present
464  if (iter==idx && txt[idx][0]=='@'){
465  continue;
466  }
467 
468  line.splitString(txt[iter],"\t ");
469 
470  //Check for Row header
471  if (line[0][0]=='@'){
472  line.pop_ith(0);
473  }
474 
475  //line.splitString(txt[iter],"\t ");
476  std::vector<double> temp = line.toVecDouble();
477  if (temp.size() != expectedColumns){
478  std::cerr << "The following line couldn't be parsed into the required number of columns. Expected Columns: " << expectedColumns << "\n The line appears as: " << txt[iter] << std::endl;
479  return false;
480  }
481  else{
482  if (valtyp == PROBABILITY){
483  prob->push_back(temp);
484  logVector(temp);
485  log_prob->push_back(temp);
486  }
487  else if (valtyp == LOG_PROB){
488  log_prob->push_back(temp);
489  expVector(temp);
490  prob->push_back(temp);
491  }
492  else if (valtyp == COUNTS){
493  counts->push_back(temp);
494  probVector(temp);
495  prob->push_back(temp);
496  logVector(temp);
497  log_prob->push_back(temp);
498  }
499 
500  }
501  }
502 
503  if (log_prob->size() != expectedRows){
504  std::cerr << " The Lexical table doesn't contain enough rows. Expected Rows: " << expectedRows << " \n Please check the Lexical Table and formatting\n";
505  return false;
506  }
507 
508  }
509 
510  return true;
511  }
512 
513 
514  //!Print the transition to stdout
516  std::cout << stringify() << std::endl;
517  }
518 
519  //!Convert the transition to a string representation as shown in the model file
520  //! \return std::string Representation of the transition
521  std::string transition::stringify(){
522  std::string transString;
523  if (transition_type == STANDARD){
524  transString+="\t" + stateName + ":\t" + double_to_string(log_trans);
525  if (func!=NULL){
526  transString+="\t" + func->stringify();
527  }
528  }
529  else if (transition_type == DURATION){
530  transString+="\t" + stateName + ":\t" ;
531  transString+= (traceback_identifier==STATE_NAME) ? "TO_STATE:\t" + traceback_string :
532  (traceback_identifier==STATE_LABEL) ? "TO_LABEL:\t" + traceback_string :
533  (traceback_identifier==STATE_GFF) ? "TO_GFF:\t" + traceback_string : "DIFF_STATE" ;
534  if (func!=NULL){
535  transString+="\t" + func->stringify();
536  }
537 
538  transString+="\n";
539  for(size_t i=0;i<distribution->size();i++){
540  transString+= "\t\t" + int_to_string((int) i+1) + "\t" + double_to_string((*distribution)[i]) + "\n";
541  }
542  }
543  else if (transition_type == LEXICAL){
544  transString+="\t" + stateName + ":\t";
545 
546 
547  if (function){
548  transString+=lexFunc->getTrackName();
549  if (func!=NULL){
550  transString+="\t" + func->stringify();
551  }
552  }
553  else{
554  for(size_t i=0;i<scoreTable.trackSize();i++){
555  if (i>0){
556  transString+=",";
557  }
558  transString+=scoreTable.getTrack(i)->getName();
559  }
560 
561 
562  if (func!=NULL){
563  transString+="\t" + func->stringify();
564  }
565 
566  transString+="\n\t\tORDER:\t";
567 
568  for(size_t i=0;i<scoreTable.trackSize();i++){
569  if (i>0){
570  transString+=",";
571  }
572  transString+=int_to_string(scoreTable.getOrder(i));
573  }
574 
576 
577  if (ambtemp!=NO_SCORE){
578  transString+="\tAMBIGUOUS:\t";
579  transString+=(ambtemp==HIGHEST_SCORE)? "MAX":
580  (ambtemp==LOWEST_SCORE)? "MIN":
581  (ambtemp==AVERAGE_SCORE)? "AVG": "LOG:" + double_to_string(scoreTable.getAmbDefinedScore());
582  }
583 
584  transString+="\n";
585 
586 
587  //Print the probability table
588  //Output Column Headers
589  transString+=scoreTable.stringify();
590 
591  }
592  }
593 
594  return transString;
595  }
596 
597  //!Calculate the transition probability for a transition at a particular position in the sequence
598  //! \param pos Position in sequences that we are determining the transition
599  //! \param seqs Pointer to sequences, used in determining transition
600  //! \return double Score of the transition
601  double transition::getTransition(size_t pos,sequences* seqs){
603  return log_trans;
604  }
605  else if (transition_type==DURATION){
606 
607  if (pos >= distribution->size()){ // Check that size corresponds to zero index or one index.....
608  return extendedValue;
609  }
610  else{
611  return (*distribution)[pos-1];
612  }
613  }
614  else if (transition_type==LEXICAL){
615 
616  double value;
617 
618  if (function){
619  value = lexFunc->evaluate(*seqs, pos);
620  }
621  else{
622  value = scoreTable.getValue(*seqs,pos);
623  }
624  return value;
625  }
626  else{
627  return -INFINITY;
628  }
629  }
630 
631 }