StochHMM  v0.34
Flexible Hidden Markov Model C++ Library and Application
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
state.cpp
Go to the documentation of this file.
1 //state.cpp
2 //Copyright (c) 2007-2012 Paul C Lott
3 //University of California, Davis
4 //Genome and Biomedical Sciences Facility
5 //UC Davis Genome Center
6 //Ian Korf Lab
7 //Website: www.korflab.ucdavis.edu
8 //Email: lottpaul@gmail.com
9 //
10 //Permission is hereby granted, free of charge, to any person obtaining a copy of
11 //this software and associated documentation files (the "Software"), to deal in
12 //the Software without restriction, including without limitation the rights to
13 //use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
14 //the Software, and to permit persons to whom the Software is furnished to do so,
15 //subject to the following conditions:
16 //
17 //The above copyright notice and this permission notice shall be included in all
18 //copies or substantial portions of the Software.
19 //
20 //THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
21 //IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
22 //FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
23 //COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
24 //IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
25 //CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
26 
27 #include "state.h"
28 namespace StochHMM{
29 
30  //!Create a state object
31  state::state():endi(NULL), stateIterator(SIZE_MAX){
32  transi = new (std::nothrow) std::vector<transition*>;
33  }
34 
35  //!Create a state from string
36  //! Parses the string to create a state
37  //! \param txt String representation of a state
38  //! \param names Names of all the states
39  //! \param trcks Tracks defined for model
40  //! \param wts Pointer to all weight defined for the model
41  //! \param funcs State functions defined for the model
42  state::state(std::string& txt, stringList& names,tracks& trcks, weights* wts, StateFuncs* funcs):endi(NULL), stateIterator(SIZE_MAX){
43 
44  //endi=new transition(STANDARD);
45  transi = new std::vector<transition*>;
46  parse(txt,names,trcks, wts,funcs);
47  }
48 
50  delete transi;
51  transi=NULL;
52  }
53 
54 
55 
56  //state::state(int size){
57  // name="";
58  // gff="";
59  // label="";
60  // endi=new transitions(STANDARD);
61  // for (int i=0;i<size;i++){
62  // //trans.push_back(0);
63  // //log_trans.push_back(MIN);
64  // transi.push_back(initial);
65  // }
66  //}
67 
68 
69  //!Parses state from string
70  //! \param txt String of state definition
71  //! \param names StringList with names of other states. Used to identify position of transitions in transition
72  //! In future, may want to organize transitions in non-linear fashion
73  bool state::parse(std::string& txt, stringList& names,tracks& trks, weights* wts, StateFuncs* funcs){
74  size_t stateHeaderInfo = txt.find("STATE:");
75  size_t transitionsInfo = txt.find("TRANSITION:");
76  size_t emissionInfo = txt.find("EMISSION:");
77  stringList lines;
78 
79  if (stateHeaderInfo==std::string::npos){
80  std::cerr << "Couldn't identify state Header Information. Please check the formatting. The supplied text was : " << txt << std::endl;
81  return false;
82  }
83 
84  if (transitionsInfo == std::string::npos){
85  std::cerr << "Couldn't identify state Transitions Information. Please check the formatting. The supplied text was : " << txt << std::endl;
86  return false;
87  }
88 
89  //Extract and Parse Header Information
90  std::string header = txt.substr(stateHeaderInfo,transitionsInfo-stateHeaderInfo);
91  if (!_parseHeader(header)){
92  return false;
93  }
94 
95 
96  //Extract and Parse Transition Information
97  std::string trans = (emissionInfo==std::string::npos) ? txt.substr(transitionsInfo) : txt.substr(transitionsInfo, emissionInfo - transitionsInfo);
98  //std::cout << trans << std::endl;
99  if (!_parseTransition(trans, names, trks, wts, funcs)){
100  return false;
101  }
102 
103 
104  //Check emissions existence (only INIT state can have no emission)
105  if (emissionInfo != std::string::npos){
106  std::string emmis = txt.substr(emissionInfo);
107  if (!_parseEmission(emmis, names, trks, wts, funcs)){
108  std::cerr << "Couldn't parse the emissions for state: " << name << std::endl;
109  return false;
110  }
111  }
112  else{
113  //Check name is INIT. Only INIT state can not have an emission
114  if (name.compare("INIT")!=0){
115  std::cerr << "No emission defined for State: " << name << std::endl;
116  return false;
117  }
118  }
119 
120  return true;
121  }
122 
123  //!Parse the state header information and assign to class variables
124  //!Required header information includes then NAME and PATH_LABEL
125  //!Optonal header information includes GFF_DESC
126  //! \param txt String representation of state header
127  bool state::_parseHeader(std::string& txt){
128 
129  clear_whitespace(txt,"\t ");
130  stringList lst = splitString(txt,"\n:");
131 
132  size_t idx;
133 
134  //parse NAME (REQUIRED)
135  if (lst.contains("NAME")){
136  idx= lst.indexOf("NAME");
137  if (idx+1 < lst.size()){
138  idx++;
139  name=lst[idx];
140  }
141  else{
142  std::cerr << "The states NAME couldn't be parsed from " << txt << std::endl;
143  return false;
144  }
145 
146 
147  if (name.compare("INIT")==0){
148  return true;
149  }
150  }
151  else{
152  std::cerr << "No NAME found in state header information. Please check formatting. Following is header information recieved: " << txt << std::endl;
153  return false;
154  }
155 
156  //parse PATH_LABEL (REQUIRED)
157  if (lst.contains("PATH")){
158  idx= lst.indexOf("PATH");
159  if (idx+1 < lst.size()){
160  idx++;
161  label=lst[idx];
162  }
163  else{
164  std::cerr << "The states PATH_LABEL couldn't be parsed from " << txt << std::endl;
165  return false;
166  }
167 
168  }
169  else{
170  std::cerr << "No PATH_LABEL found in state header information. Please check formatting. Following is header information recieved: " << txt << std::endl;
171  return false;
172  }
173 
174  //Parse GFF_DESC (Optional)
175  if (lst.contains("GFF")){
176  idx= lst.indexOf("GFF");
177  if (idx+1 < lst.size()){
178  idx++;
179  gff=lst[idx];
180  }
181  else{
182  std::cerr << "The states GFF_DESC couldn't be parsed from " << txt << std::endl;
183  return false;
184  }
185  }
186 
187  return true;
188  }
189 
190 
191  //!Parses the transitions of the state from a text string
192  //! \param txt Text representation of the transitions
193  //! \param name StringList of all state names defined in the model
194  //! \param trks Reference to tracks for the model
195  //! \param wts Weight defined in the model
196  //! \param funcs State functions defined for the model
197  bool state::_parseTransition(std::string& txt, stringList& names, tracks& trks, weights* wts, StateFuncs* funcs){
198  //SPLIT UP TRANSITIONS AND APPLY SEPARATELY
199  stringList lst;
200  lst.splitND(txt,"TRANSITION:");
201 
202 
203  for(size_t iter=0;iter<lst.size();iter++){
204 
205  stringList line=splitString(lst[iter],"\n");
206  line.removeLWS("\t ");
207  line.removeComments();
208  //Line 1 defines transition type
209 
210  clear_whitespace(line[0],"\t");
211  stringList head=splitString(line[0],":");
212 
213  //DETERMINE TYPE
214  transType tp;
215  if (head.contains("STANDARD")){tp=STANDARD;}
216  else if (head.contains("DURATION")){tp=DURATION;}
217  else if (head.contains("LEXICAL")){tp=LEXICAL;}
218  else{
219  std::cerr << "Unrecognized Transition type(STANDARD, DURATION, LEXICAL):" << txt << std::endl;
220  return false;
221  //Error not recognized transition type
222  }
223 
224  //DETERMINE VALUE TYPE
225  valueType valtyp;
226  if (head.contains("P(X)")){valtyp=PROBABILITY;}
227  else if (head.contains("LOG")){valtyp=LOG_PROB;}
228  else if (head.contains("COUNTS")){valtyp=COUNTS;}
229  else {
230  std::cerr << "Unrecognized Transition value type( P(X), LOG, COUNTS): " << txt << std::endl;
231  return false;
232  //Error not recognized value type
233  }
234 
235 
236 
237  if (tp==STANDARD){
238  //Process each following from line
239  for(size_t iter=1;iter<line.size();iter++){
240  transition* temp = new(std::nothrow) transition(tp);
241 
242  if (temp==NULL){
243  std::cerr << "OUT OF MEMORY\nFile" << __FILE__ << "Line:\t"<< __LINE__ << std::endl;
244  exit(1);
245  }
246 
247  if (!temp->parse(line[iter], names, valtyp, trks, wts, funcs)){
248  std::cerr << "Couldn't parse Transition " << std::endl;
249  }
250 
251  std::string transName = temp->getName();
252 
253  if (transName=="END"){
254  endi = temp;
255  }
256  else{
257  //size_t idx = names.indexOf(transName);
258  //idx--;
259  addTransition(temp);
260  }
261  }
262 
263  }
264 
265  else if (tp==DURATION || tp==LEXICAL ){
266  transition* temp = new(std::nothrow) transition(tp);
267 
268  if (temp==NULL){
269  std::cerr << "OUT OF MEMORY\nFile" << __FILE__ << "Line:\t"<< __LINE__ << std::endl;
270  exit(1);
271  }
272 
273  if (!temp->parse(line,names,valtyp,trks,wts,funcs)){
274  std::cerr << "Couldn't parse Transition "<< std::endl;
275  return false;
276  }
277  std::string transName = temp->getName();
278 
279 
280  //size_t idx = names.indexOf(transName);
281  //idx--;
282  addTransition(temp);
283  }
284  }
285  return true;
286  }
287 
288 
289  //!Parses the emission for a state from a string
290  //! \param txt String representation of emissions
291  //! \param names stringList of all state names defined in the model
292  //! \param trks Tracks defined in the model
293  //! \param wts Weight defined of the model
294  //! \param funcs StateFunction defined for the model
295 
296  bool state::_parseEmission(std::string& txt, stringList& names, tracks& trks, weights* wts, StateFuncs* funcs){
297  stringList lst;
298  lst.splitND(txt,"EMISSION:");
299  //lst.print();
300 
301  for(size_t iter=0; iter<lst.size();iter++){
302  emm* temp = new(std::nothrow) emm;
303 
304  if (temp==NULL){
305  std::cerr << "OUT OF MEMORY\nFile" << __FILE__ << "Line:\t"<< __LINE__ << std::endl;
306  exit(1);
307  }
308 
309  if (!temp->parse(lst[iter],trks,wts,funcs)){
310  return false;
311  }
312  emission.push_back(temp);
313  }
314 
315  return true;
316  }
317 
318  //Print the string representation of the state to the stdout
319  void state::print(){
320  std::cout<< stringify() << std::endl;
321  }
322 
323 
324  //Get a string representation for the state
325  //! \return std::string
326  std::string state::stringify(){
327  std::string stateString;
328  stateString+="STATE:\n";
329  stateString+="\tNAME:\t" + name + "\n";
330 
331  if (name.compare("INIT")!=0){
332  if (!gff.empty()){ stateString+="\tGFF_DESC:\t" + gff + "\n";}
333  stateString+="\tPATH_LABEL:\t" + label + "\n";
334  }
335 
336  std::string standardString;
337  std::string distribString;
338  std::string lexicalString;
339 
340  //Get Transitions Standard, Distribution, Lexical
341  for(size_t i=0;i<transi->size();i++){
342  if ((*transi)[i]==NULL){ continue;}
343 
344  transType tp = (*transi)[i]->getTransitionType();
345  if (tp == STANDARD){
346  if (standardString.empty()){
347  standardString+="TRANSITION:\tSTANDARD:\tLOG\n";
348  }
349  standardString+=(*transi)[i]->stringify() + "\n";
350  }
351  else if (tp == DURATION){
352  distribString+="TRANSITION:\tDURATION:\tLOG\n";
353  distribString+=(*transi)[i]->stringify();
354 
355  }
356  else if (tp == LEXICAL){
357  if ((*transi)[i]->LexFunctionDefined()){
358  lexicalString+="TRANSITION:\tLEXICAL:\tFUNCTION:\t";
359  lexicalString+=(*transi)[i]->getLexicalFunctionName();
360  lexicalString+="\n";
361  }
362  else{
363  lexicalString+="TRANSITION:\tLEXICAL:\tLOG\n";
364  lexicalString+=(*transi)[i]->stringify();
365  }
366 
367  }
368  }
369 
370 
371  //Process Transition to Ending;
372  if (endi!=NULL){
373  if (standardString.empty()){
374  standardString+="TRANSITIONS:\tSTANDARD:\tLOG\n";
375  }
376  standardString+=endi->stringify();
377  }
378 
379 
380  if (name.compare("INIT")==0){
381  stateString+=standardString + "\n\n";
382  return stateString;
383  }
384  else{
385  if (!standardString.empty()){
386  stateString+=standardString + "\n";
387  }
388 
389  if (!distribString.empty()){
390  stateString+=distribString + "\n";
391  }
392 
393  if (!lexicalString.empty()){
394  stateString+=lexicalString + "\n";
395  }
396  }
397 
398 
399 
400  //Print Emissions
401  for(size_t i=0;i<emission.size();i++){
402  stateString+=emission[i]->stringify();
403  }
404 
405  return stateString;
406  }
407 
408  //! Get the emission value from a state at a particular position in the sequence
409  //! \param seqs Sequences the model is analyzing
410  //! \param iter Position in the sequence
411  double state::get_emission_prob(sequences &seqs, size_t iter){
412  double value(emission[0]->get_emission(seqs,iter));
413  for(size_t i=1;i<emission.size();i++){
414  value+=emission[i]->get_emission(seqs,iter); //Pass sequences type to get_emission for each emission in the state
415  }
416  return value;
417  }
418 
419 
420  //! Get the transition value from a state at a particular position in the sequence
421  //! \param seqs Sequences the model is analyzing
422  //! \param to State that transition is being calculated to
423  //! \param iter Position in the sequence
424  double state::get_transition_prob(sequences &seqs, size_t to, size_t iter){
425  double value;
426 
427  if ((*transi)[to]==NULL){
428  return -INFINITY;
429  }
430  else if ((*transi)[to]->transition_type==STANDARD){
431  value = (*transi)[to]->log_trans;
432  }
433  else{
434  std::cerr << "Need to implement this functionality" <<std::endl;
435  value = (*transi)[to]->getTransition(iter,&seqs);
436 
437  }
438  return value;
439  }
440 
441  //! Get the log probability transitioning to end from the state
443  if (endi==NULL){
444  return -INFINITY;
445  }
446  return endi->log_trans;
447  }
448 
449  //TODO: complete the checkLabels function
450  //! Checks the label tags for traceback and combine identifiers
451  void state::checkLabels(std::set<std::string>& labels, std::set<std::string>& gff, std::set<std::string>& name){
452  for(size_t i = 0;i<(*transi).size();i++){
453  transitionFuncParam* func = (*transi)[i]->getExtFunction();
454  if (func!=NULL){
455 
457  std::string tn = func->getTracebackName();
458 
459  //Traceback Identifier
460  if (tt != START_INIT && tt != DIFF_STATE){
461  if (tt == STATE_NAME){
462  if (!name.count(tn)){
463  std::cerr << "Model Function or Distribution traceback definition contains Unknown State Name:\t" << tn << std::endl;
464  }
465  }
466  else if (tt == STATE_GFF){
467  if (!gff.count(tn)){
468  std::cerr << "Model function or distribution traceback definition contains Unknown GFF Descrition:\t" << tn << std::endl;
469  }
470  }
471  else if (tt == STATE_LABEL){
472  if (!labels.count(tn)){
473  std::cerr << "Model function or distribution traceback definition contains Unknown Path Label:\t" << tn << std::endl;
474  }
475  }
476  }
477 
478 
479  combineIdentifier ct = func->getCombineType();
480  std::string cn = func->getCombineName();
481 
482  //Combine Identifiers
483  if (ct != FULL){
484  if (ct == STATENAME){
485  if (!name.count(cn)){
486  std::cerr << "Traceback Combine definition contains Unknown State Name:\t" << cn << std::endl;
487  }
488  }
489  else if (ct == STATEGFF){
490  if (!gff.count(cn)){
491  std::cerr << "Traceback Combine definition contains Unknown GFF Descrition:\t" << cn << std::endl;
492  }
493  }
494  else if (ct == STATELABEL){
495  if (!labels.count(cn)){
496  std::cerr << "Traceback Combine definition contains Unknown Path Label:\t" << cn << std::endl;
497  }
498  }
499  }
500 
501  }
502  }
503 
504  return;
505  }
506 
507 
508  /* On initial import of the states they are pushed on the transi vector in
509  the order written in model. However, the analysis requires that they be
510  in the particular position defined by state iterator.
511 
512  This function puts the transitions in the proper order for analysis
513  */
514  void state::_finalizeTransitions(std::map<std::string,state*>& state_index){
515 
516  //Get size # of states, but correct by -1 because
517  //initial state will be kept separate.
518  size_t number_of_states = state_index.size();
519  std::vector<transition*>* fixed_trans = new std::vector<transition*>(number_of_states-1,NULL);
520 
521  //Find the proper place for the transition and put it in the correct position
522  for(size_t i = 0; i < transi->size(); i++){
523  transition* temp = (*transi)[i];
524  std::string name = temp->getName();
525  state* st = state_index[name];
526  if (st == NULL){
527  std::cerr << "State: " << name << " was declared but not defined in the model." << std::endl;
528  exit(2);
529  }
530  size_t index = st->getIterator();
531  (*fixed_trans)[index]=temp;
532  (*transi)[i]=NULL;
533  }
534 
535  delete transi; //Don't need the old transition vector anymore
536  transi = fixed_trans;
537  return;
538  }
539 
540 
542  for(size_t i=0; i < emission.size() ; ++i){
543  if (emission[i]->isComplex()){
544  return true;
545  }
546  }
547  return false;
548  }
549 
550 
551 
553  for(size_t i=0;i<(*transi).size();++i){
554 
555  if ((*transi)[i]==NULL){
556  continue;
557  }
558 
559  if ((*transi)[i]->isComplex()){
560  return true;
561  }
562  }
563  return false;
564  }
565 
568  return true;
569  }
570  return false;
571  }
572 
573 }