StochHMM  v0.34
Flexible Hidden Markov Model C++ Library and Application
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
track.cpp
Go to the documentation of this file.
1 //track.cpp
2 //Copyright (c) 2007-2012 Paul C Lott
3 //University of California, Davis
4 //Genome and Biomedical Sciences Facility
5 //UC Davis Genome Center
6 //Ian Korf Lab
7 //Website: www.korflab.ucdavis.edu
8 //Email: lottpaul@gmail.com
9 //
10 //Permission is hereby granted, free of charge, to any person obtaining a copy of
11 //this software and associated documentation files (the "Software"), to deal in
12 //the Software without restriction, including without limitation the rights to
13 //use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
14 //the Software, and to permit persons to whom the Software is furnished to do so,
15 //subject to the following conditions:
16 //
17 //The above copyright notice and this permission notice shall be included in all
18 //copies or substantial portions of the Software.
19 //
20 //THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
21 //IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
22 //FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
23 //COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
24 //IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
25 //CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
26 
27 #include "track.h"
28 namespace StochHMM{
29 
30  // FIXME: Check if ambiguous before allowing to get character.
31  //!Create an ambiguous character
32  //! For example in DNA N = [ACGT] = [0,1,2,3]
33  //! \param tr Track to use for ambiguity code
34  //! \param ambChar String representation of the character/symbol/word for ambiguous character
35  //! \param defs vector of strings where each string is a non-ambiguous character defined in the track
36  ambigCharacter::ambigCharacter(track* tr, std::string& ambChar, std::vector<std::string>& defs){
37  symbol=ambChar;
38  for(size_t i=0;i<defs.size();i++){
39  setDefinition.push_back(tr->symbolIndex(defs[i]));
40  }
41  sort(setDefinition.begin(),setDefinition.end());
42  return;
43  }
44 
45 
46  //!Create a track
50  ambiguous=false;
52  trackFunctionDefined= false;
53  maxSize=0;
54  max_ambiguous =0;
55  max_unambiguous =0;
56  complementSet=false;
57  charIndices = NULL;
58  }
59 
60  track::track(std::vector<std::string>& characters){
61 
63  ambiguous=false;
65  trackFunctionDefined= false;
66  maxSize=0;
67  max_ambiguous =0;
68  max_unambiguous =0;
69  complementSet=false;
70  charIndices = NULL;
71 
72  addAlphabetChar(characters);
74  }
75 
76  //!Get the letter/word that has a given digitized value
77  //! \param iter integer value of digital symbol
78  //! \return std::string The string value in the undigitized sequence that is associated with the integer digitized value;
79  std::string track::getAlpha(int iter){
80  if (iter<=max_unambiguous){
81  return alphabet[iter];
82  }
83  else if (iter <= max_ambiguous){
84  return getAmbiguousCharacter(iter);
85  }
86  else{
87  return "";
88  }
89  }
90 
91  //!Get the letter/word that has a given digitized value
92  //! \param iter integer value of digital symbol
93  //! \return std::string The string value in the undigitized sequence that is associated with the integer digitized value;
94  std::string track::getAlpha(size_t iter){
95  if (iter<=max_unambiguous){
96  return alphabet[iter];
97  }
98  else if (iter <= max_ambiguous){
99  return getAmbiguousCharacter(iter);
100  }
101  else{
102  return "";
103  }
104  }
105 
106  //FIXME: Have it check before adding value
107  //! Add a letter/word symbol to the track
108  //! \param character Word or symbol used in undigitized sequence
109  bool track::addAlphabetChar(std::string& character){
110 
111  if (alphabet.size() >= 255){
112  std::cerr << "Alphabet limit reached. Unable to add additional characters to the track:\t" << character << std::endl;
113  return false;
114  }
115 
116 
117  if (character.size()>maxSize){maxSize=character.size();};
118 
119  alphabet.push_back(character);
120 
121  size_t index = alphabet.size()-1;
122 
123  symbolIndices[character]=index;
124 
125  max_unambiguous = index;
126  unambiguous.push_back(index);
128 
129  return true;
130  }
131 
132  bool track::addAlphabetChar(const char *character){
133  std::string string_character(character);
135 
136  if (string_character.size()>maxSize){maxSize=string_character.size();};
137 
138  return addAlphabetChar(string_character);
139  }
140 
141  bool track::addAlphabetChar(std::vector<std::string>& characters){
142  for(size_t i=0;i<characters.size();i++){
143  addAlphabetChar(characters[i]);
144  }
145 
147 
148  return true;
149  }
150 
151 
152  bool track::addAlphabetChar(size_t chSize, const char * characters[]){
153  for(size_t i =0; i < chSize; i++ ){
154  addAlphabetChar(characters[i]);
155  }
156  return true;
157  }
158 
159 
160  bool track::addAlphabetChar(std::string& character, std::string& complement){
161  addAlphabetChar(character);
162  addComplement(character, complement);
163  complementSet = true;
165  return true;
166  }
167 
168  bool track::addAlphabetChar(size_t chSize, const char* characters[], const char* complements[]){
169  for(size_t i=0;i<chSize;i++){
170  addAlphabetChar(characters[i]);
171  addComplement(characters[i], complements[i]);
172  }
173 
174  complementSet = true;
176 
177  return true;
178  }
179 
180 
181  //FIXME: Need to fix the code below and test
182  //Complements added by Ken
183  bool track::addAlphabetChar(std::vector<std::string>& characters, std::vector<std::string>& complements){
184 
185  if (characters.size() != complements.size()){
186  //Error Message
187  std::cerr << "Number of Complement characters and Characters don't match.\n";
188  return false;
189  }
190 
191 
192  for(size_t i=0;i<characters.size();i++){
193  addAlphabetChar(characters[i]);
194  addComplement(characters[i], complements[i]);
195  }
196 
197  complementSet = true;
199 
200  return true;
201  }
202 
203  void track::addComplement(std::string& character, std::string& complement){
204  int index = symbolIndex(character);
205  int comp = symbolIndex(complement);
206  complementAlphabet[index]=comp;
207  complementSet = true;
208 
209  return;
210  }
211 
212  void track::addComplement(const char *character, const char *complement) {
213  std::string string_character(character);
214  std::string string_complement(complement);
215  addComplement(string_character, string_complement);
216 
217  return;
218  }
219 
220  bool track::addComplement(std::vector<std::string>& characters, std::vector<std::string>& complements){
221 
222  if (characters.size() != complements.size()){
223  std::cerr << "Number of Complement characters and Characters don't match.\n";
224  return false;
225  }
226 
227  for(size_t i=0;i<characters.size();i++){
228  addComplement(characters[i], complements[i]);
229  }
230 
231  return true;
232  }
233 
234 
235  //!Add an ambiguous character/word definition to the track
236  //! \param ambChar word/symbol fore the ambiguous character
237  void track::addAmbiguous(std::string& ambChar, std::vector<std::string>& defs){
238  if (defaultAmbiguous==-1){
240  }
241  ambigCharacter amb(this,ambChar,defs);
242  ambiguousSymbols.push_back(amb);
243 
244  int index = (int) symbolIndices.size(); //Get the index position and new digital reference value
245 
246  if (index >= 255){
247  std::cerr << "Maximum number of discrete symbols reached at 255\n";
248  exit(2);
249  }
250 
251  symbolIndices[ambChar]=index;
252  max_ambiguous = index;
253  return;
254  }
255 
256 
257  //! Get symbol assigned integer value
258  //! \param symbol word/letter/symbol that we want to get it's assigned integer value
259  uint8_t track::symbolIndex(const std::string& symbol){
260  if (symbolIndices.count(symbol)==0){ //If isn't found in the hash
261  if (ambiguous){ //Return default character if ambiguous is set
262  std::cerr << symbol << "not found in HMM definitions. Using default ambiguous character.\n";
263  return defaultAmbiguous;
264  }
265  else{
266  std::cerr << "Encountered an ambiguous character in the sequence. No ambiguous characters are allowed because they weren't set in the model. To allow ambiguous characters, please add an \" Ambiguous Character Definition\" to the model" << std::endl;
267  exit(1);
268  }
269  }
270  else{
271  return symbolIndices[symbol];
272  }
273  }
274 
275  //! Get symbol assigned integer value
276  //! \param symbol word/letter/symbol that we want to get it's assigned integer value
277  uint8_t track::symbolIndex(unsigned char symbol){
278 
279  if (maxSize != 1){
280  std::cerr << "Track Max Symbols Size:\t" << maxSize << "\t Must use function track::symbolIndex(const std::string& symbol)\n";
281  }
282 
283  if (charIndices == NULL){
284  charIndices = new (std::nothrow) std::vector<uint8_t>(255,255);
285  for(std::map<std::string,uint8_t>::iterator it = symbolIndices.begin(); it != symbolIndices.end(); it++){
286  (*charIndices)[(it->first)[0]] = it->second;
287  }
288  }
289 
290 
291  if ((*charIndices)[symbol]==255){ //If isn't found in the array
292  if (ambiguous){ //Return default character if ambiguous is set
293  return defaultAmbiguous;
294  }
295  else{
296  std::cerr << "Encountered an ambiguous character in the sequence. No ambiguous characters are allowed because they weren't set in the model. To allow ambiguous characters, please add an \" Ambiguous Character Definition\" to the model" << std::endl;
297  exit(1);
298  }
299  }
300  else{
301  return (*charIndices)[symbol];
302  }
303  }
304 
305 
306  //FIXME: Change return value so only returns true if parse is OK
307  //! Parse a string representation of track to define a tracks parameters
308  //! \param txt Line from model that describes a track
309  //! \return true if the track was parsed properly
310  bool track::parse(std::string& txt){
311  stringList lst;
312  stringList tag = extractTag(txt);
313  size_t index;
314 
315  lst.fromNext(txt);
316  setName(lst[0]);
317  setDescription(lst.getComment());
318 
319  if (lst[1].compare("REAL_NUMBER")==0){
321  if (tag.size()>0){
322  if (tag.contains("FUNCTION")){
323  index=tag.indexOf("FUNCTION");
324  trackFunction=tag[index+1];
325  }
326  else{
327  std::cerr << "Real number track function tag must contain FUNCTION: and USE: . Please check the formatting of your tag. Here is the tag as parsed: " << tag.stringify() << std::endl;
328  return false;
329 
330  }
331 
332  if (tag.contains("USE")){
333  index=tag.indexOf("USE");
334  trackToUse=tag[index+1];
335  }
336  else{
337  std::cerr << "Real number track tag must contain FUNCTION: and USE: . Please check the formatting of your tag. Here is the tag as parsed: " << tag.stringify() << std::endl;
338  return false;
339  }
341  }
342  }
343  else{
345 
346  for(size_t i=1;i<lst.size();i++){
347  if (!addAlphabetChar(lst[i])){
348  std::cerr << "Track import failed, because number of symbols exceeded 255. Alternatively, you can create a real number track for different emissions" << std::endl;
349  return false;
350  }
351 
352  }
353  }
354  return true;
355  }
356 
357 
358  //! Get the string representation of the track
359  //! \return std::string Definition of the track as in model
360  std::string track::stringify(){
361  std::string output;
362  output+=name + ":\t";
363 
364  if (alpha_type == ALPHA_NUM){
365  output+=join(alphabet,',');
366  }
367  else{
368  output+="REAL_NUMBER";
370  output+="\t[FUNCTION:\t" + trackFunction;
371  output+="\tUSE:\t" + trackToUse + "]";
372  }
373  }
374  output+="\n";
375 
376  return output;
377  }
378 
379  //!Get the string representation of the ambiguous character definitions as in model file
380  //! \return std::string
381  std::string track::stringifyAmbig(){
382  std::string output;
383  output+=name + ":\t";
384  for (size_t i = max_unambiguous+1; i <= max_ambiguous; i++){
385  if (i > (size_t) max_unambiguous+1){ output+= ",";}
386 
387  output+=getAmbiguousCharacter(i);
388  output+="[";
389 
390  std::vector<size_t>& regChar = getAmbiguousSet(i);
391  for(size_t k = 0; k<regChar.size();k++){
392  if (k>0){output+=",";}
393  output+=alphabet[regChar[k]];
394  }
395  output+="]";
396  }
397  return output;
398  }
399 
400 
401 
402  std::string track::convertIndexToWord(size_t wordIndex, size_t order){
403  std::string output="";
404 
405  if (order == 0){
406  return "";
407  }
408 
409  size_t currentOrder = order;
410 
411  while (currentOrder>1){
412  double dreg=POWER[currentOrder-1][alphabet.size()-1];
413  size_t temp = floor ((double) wordIndex / dreg);
414  output+=alphabet[temp];
415  if (maxSize!=1){
416  output += ",";
417  }
418 
419  wordIndex-=temp*dreg;
420  currentOrder--;
421  }
422 
423  output+=alphabet[wordIndex];
424 
425  return output;
426  }
427 
428 
429  void track::convertIndexToDigital(size_t wordIndex, size_t order, uint8_t word[]){
430  if (order == 0){
431  word[0]=wordIndex;
432  return;
433  }
434 
435  std::cout << alphabet.size() << std::endl;
436 
437  size_t currentOrder = order;
438 
439  while (currentOrder>1){
440  double dreg=POWER[currentOrder-1][alphabet.size()-1];
441  size_t temp = floor ((double) wordIndex / dreg);
442  word[currentOrder-1] = temp;
443  wordIndex-=temp*dreg;
444  currentOrder--;
445  }
446 
447  word[0] = wordIndex;
448  return;
449  }
450 
451 
452  //FIXME: Change return value so only returns true if parse is OK
453  //! Parse the ambiguous character definitions from model file
454  //! \param txt String representation of ambiguous character definition as in model file
455  //! \return true if the ambiguous characters were properly parsed
456  bool track::parseAmbiguous(std::string& txt){
457  std::vector<std::pair<std::string,std::vector<std::string> > > temp;
458 
459  _splitAmbiguousList(temp, txt);
460  if (temp.size()==0){
461  return false;
462  }
463  setAmbiguous();
464  for (size_t i=0;i<temp.size();i++){
465  addAmbiguous(temp[i].first, temp[i].second);
466  }
467 
468  return true;
469  }
470 
471  void track::_splitAmbiguousList(std::vector<std::pair<std::string,std::vector<std::string> > >& results, const std::string& text){
472 
473  size_t opening;
474  size_t closing;
475  size_t start=0;
476 
477  opening=text.find_first_of('[');
478  while(opening!=std::string::npos){
479  std::pair<std::string,std::vector<std::string> > amb;
480  amb.first=text.substr(start,opening-start);
481  clear_whitespace(amb.first, "\t ");
482 
483  closing=text.find_first_of(']',opening);
484  if (closing!=std::string::npos){
485  std::string tempString=text.substr(opening+1,closing-opening-1);
486  split_line(amb.second, tempString);
487  }
488  start=text.find_first_not_of(',',closing+1);
489  results.push_back(amb);
490  opening=text.find_first_of('[',closing);
491  }
492 
493  return;
494  }
495 
496 
497  //! Get the string representation of the ambigous character defined by integer value
498  //! If ambiguous character isn't defined, return value is "*"
499  //! \param val Integer value representing the ambiguous character
500  std::string track::getAmbiguousCharacter(size_t val){
501  if (getAmbiguousSize()==0){
502  return "*";
503  }
504 
505  return ambiguousSymbols[(val-max_unambiguous)-1].getSymbol();
506  }
507 
508 
509  //! Add track to tracks container
510  //! \param tk Pointer to track to be added
512  std::string& name= tk->name;
513 
514  if (!index.count(name)){
515  index[tk->getName()]=trks.size();
516  trks.push_back(tk);
517  }
518  else{
519  std::cerr << "Track with name: " << name << " already exists. Cannot add tracks with the same name\n";
520  exit(1);
521  }
522 
523  }
524 
525  //!Get iterator index of track by tracks name
526  //! \param name Name of the track
527  //! \return size_t Iterator to track within the tracks
528  //! \return -1 if track doesn't exist in tracks
529  size_t tracks::indexOf(const std::string& name){
530  if (index.count(name)){
531  return index[name];
532  }
533  else{
534  return SIZE_MAX;
535  }
536  }
537 
538 
539  //!Get pointer to track from the track name
540  //! \param name Name of the track
541  //! \return pointer to track if it exists, NULL otherwise
542  track* tracks::getTrack(const std::string& name){
543  if (index.count(name)){
544  return trks[index[name]];
545  }
546  else{
547  return NULL;
548  }
549  }
550 
551  bool tracks::isTrackDefined(const std::string& name){
552  if (index.count(name)){
553  return true;
554  }
555 
556  return false;
557  }
558 
559  //!Print the each track in tracks to stdout
561  std::cout << stringify() << std::endl;
562  }
563 
564  //! Get string representation of each track in tracks
565  //! \return std::string String representation of tracks as in model file
566  std::string tracks::stringify(){
567  std::string trackString;
568  std::string ambigString;
569  std::string lnSep(50,'=');
570  trackString+="TRACK SYMBOL DEFINITIONS\n" + lnSep + "\n";
571 
572  for(size_t i=0;i<trks.size();i++){
573  trackString+=trks[i]->stringify();
574  if (trks[i]->isAmbiguousSet()){
575  ambigString+=trks[i]->stringifyAmbig();
576  }
577  }
578  if (!ambigString.empty()){
579  ambigString="AMBIGUOUS SYMBOL DEFINITIONS\n" + lnSep + "\n"+ ambigString;
580  trackString+="\n" + ambigString + "\n";
581  }
582  else{
583  trackString+="\n";
584  }
585 
586  return trackString;
587  }
588 
589 
590  //! Get the complement alphabet character digitized value given a value
591  //! \param val Value of character to get complement of
592  //! \return int value of complement
593  uint8_t track::getComplementIndex(uint8_t val){
594  if (complementSet){
595  if (complementAlphabet.count(val)){
596  return complementAlphabet[val];
597  }
598  else{
599  std::cerr<< "Complement of " << val << " is not set in track\n";
600  return -1;
601  }
602  }
603  else{
604  std::cerr << "No complements are set in the track\n";
605  exit(1);
606  }
607  }
608 
609 
610 
611  //! Get the complement alphabet character digitized value given the string
612  //! \param character String of alphanumerical symbol
613  //! \return int Defined complement string symbol of symbol
614  uint8_t track::getComplementIndex(std::string& character){
615  if (complementSet){
616  if (symbolIndices.count(character)){
617  int characterIndex = symbolIndex(character);
618  return complementAlphabet[characterIndex];
619  }
620  else{
621  std::cerr<< "Complement of " << character << " is not set in track\n";
622  return -1;
623  }
624  }
625  else{
626  std::cerr << "No complements are set in the track\n";
627  exit(1);
628  }
629  }
630 
631 
632  //! Get the complement alphanumerical string of a given integer value;
633  //! \param value Integer value of a symbol
634  //! \return std::string Defined complement string symbol of symbol with digitized value
635  std::string track::getComplementSymbol(uint8_t value){
636  if (complementSet){
637  if (complementAlphabet.count(value)){
638  int complement_value = complementAlphabet[value];
639  return getAlpha(complement_value);
640  }
641  else{
642  std::cerr<< "Complement of " << value << " is not set in track\n";
643  return " ";
644  }
645  }
646  else{
647  std::cerr << "No complements are set in the track\n";
648  exit(1);
649  }
650 
651  }
652 
653 
654  //! Get the compelment alphabet character digitized value given the string
655  //! \param character String of alphanumerical symbol
656  //! \return std::string Defined complement string symbol
657  std::string track::getComplementSymbol(std::string& character){
658  if (complementSet){
659  if (symbolIndices.count(character)){
660  int characterIndex = symbolIndex(character);
661  int complement_value = complementAlphabet[characterIndex];
662  return getAlpha(complement_value);
663  }
664  else{
665  std::cerr<< "Complement of " << character << " is not set in track\n";
666  return " ";
667  }
668  }
669  else{
670  std::cerr << "No complements are set in the track\n";
671  exit(1);
672  }
673  }
674 
675 
676 }