StochHMM  v0.34
Flexible Hidden Markov Model C++ Library and Application
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
sequence.cpp
Go to the documentation of this file.
1 //
2 // sequence.cpp
3 //Copyright (c) 2007-2012 Paul C Lott
4 //University of California, Davis
5 //Genome and Biomedical Sciences Facility
6 //UC Davis Genome Center
7 //Ian Korf Lab
8 //Website: www.korflab.ucdavis.edu
9 //Email: lottpaul@gmail.com
10 //
11 //Permission is hereby granted, free of charge, to any person obtaining a copy of
12 //this software and associated documentation files (the "Software"), to deal in
13 //the Software without restriction, including without limitation the rights to
14 //use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
15 //the Software, and to permit persons to whom the Software is furnished to do so,
16 //subject to the following conditions:
17 //
18 //The above copyright notice and this permission notice shall be included in all
19 //copies or substantial portions of the Software.
20 //
21 //THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
22 //IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
23 //FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
24 //COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
25 //IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
26 //CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
27 
28 #include "sequence.h"
29 
30 namespace StochHMM{
31 
32 
33  //!Create a sequence datatype
35  realSeq=false;
36  real = NULL;
37  seq=NULL;
38  mask=NULL;
39  seqtrk = NULL;
40  length = 0;
41  max_mask=-1;
42  external= NULL;
43  attrib = -INFINITY;
44  }
45 
46  //!Create a sequence data type
47  //! \param realTrack True if the sequence is a list of real numbers
48  sequence::sequence(bool realTrack):mask(NULL){
49  if (realTrack){
50  real=new(std::nothrow) std::vector<double>;
51 
52  if (real==NULL){
53  std::cerr << "OUT OF MEMORY\nFile" << __FILE__ << "Line:\t"<< __LINE__ << std::endl;
54  exit(1);
55  }
56 
57  seq=NULL;
58  }
59  else{
60  real=NULL;
61  seq=new(std::nothrow) std::vector<uint8_t>;
62 
63  if (seq==NULL){
64  std::cerr << "OUT OF MEMORY\nFile" << __FILE__ << "Line:\t"<< __LINE__ << std::endl;
65  exit(1);
66  }
67  }
68  realSeq = realTrack;
69  seqtrk = NULL;
70  length = 0;
71  max_mask=-1;
72  external= NULL;
73  attrib = -INFINITY;
74  }
75 
76  //! \brief Create a sequence typ
77  //! \param vec Vector of doubles to used as the real numbers for the sequence
78  //! \param tr Pointer to the track to be used
79  sequence::sequence(std::vector<double>* vec ,track* tr):mask(NULL){
80  realSeq = true;
81  seqtrk = tr;
82  real = vec;
83  external= NULL;
84  attrib = -INFINITY;
85  length = vec->size();
86  max_mask=-1;
87  }
88 
89  //! Create a sequence type
90  //! \param sq Character string that represent sequence
91  //! \param tr Track to be used to digitize sequence
92  sequence::sequence(char* sq, track* tr):mask(NULL){
93  length = 0;
94  max_mask=-1;
95  real = NULL;
96  seq = new(std::nothrow) std::vector<uint8_t>;
97 
98  if (seq==NULL){
99  std::cerr << "OUT OF MEMORY\nFile" << __FILE__ << "Line:\t"<< __LINE__ << std::endl;
100  exit(1);
101  }
102 
103  external= NULL;
104  attrib = -INFINITY;
105  realSeq = false;
106  seqtrk = tr;
107  undigitized = sq;
108  _digitize();
109  length=seq->size();
110  }
111 
112 
113  //! Create a sequence type
114  //! \param sq std::string string that represent sequence
115  //! \param tr Track to be used to digitize sequence
116  sequence::sequence(std::string& sq, track* tr):mask(NULL){
117  length = 0;
118  max_mask=-1;
119  real = NULL;
120  seq = new(std::nothrow) std::vector<uint8_t>;
121 
122  if (seq==NULL){
123  std::cerr << "OUT OF MEMORY\nFile" << __FILE__ << "Line:\t"<< __LINE__ << std::endl;
124  exit(1);
125  }
126 
127  external= NULL;
128  attrib = -INFINITY;
129  realSeq = false;
130  seqtrk = tr;
131  undigitized = sq;
132  _digitize();
133  length=seq->size();
134  }
135 
136  //!Destroy sequence type
138  delete seq;
139  delete real;
140  delete mask;
141 
142  seq = NULL;
143  real = NULL;
144  mask = NULL;
145  seqtrk = NULL;
146  external= NULL;
147  }
148 
149 
150 
151  //! Copy constructor for sequence
152  //!
154  realSeq = rhs.realSeq;
155  header = rhs.header;
156  attrib = rhs.attrib;
157  length = rhs.length;
158  seqtrk = rhs.seqtrk;
159  external= rhs.external; //Need copy constructor for this
160  max_mask= rhs.max_mask;
162 
163  if (rhs.seq!=NULL){
164  seq = new(std::nothrow) std::vector<uint8_t>(*rhs.seq);
165  if (seq==NULL){
166  std::cerr << "OUT OF MEMORY\nFile" << __FILE__ << "Line:\t"<< __LINE__ << std::endl;
167  exit(1);
168  }
169  }
170  else{
171  seq=NULL;
172  }
173 
174  if (rhs.real!=NULL){
175  real = new(std::nothrow) std::vector<double>(*rhs.real);
176  if (real==NULL){
177  std::cerr << "OUT OF MEMORY\nFile" << __FILE__ << "Line:\t"<< __LINE__ << std::endl;
178  exit(1);
179  }
180  }
181  else{
182  real=NULL;
183  }
184 
185  if (rhs.mask!=NULL){
186  mask = new(std::nothrow) std::vector<int>(*rhs.mask);
187  if (mask==NULL){
188  std::cerr << "OUT OF MEMORY\nFile" << __FILE__ << "Line:\t"<< __LINE__ << std::endl;
189  exit(1);
190  }
191  }
192  else{
193  mask = NULL;
194  }
195  }
196 
197 
198  //Clear the sequence and header
200  header = "";
201  undigitized="";
202  max_mask = -1;
203 
204  if (mask!=NULL){
205  delete mask;
206  mask = NULL;
207  }
208 
209  if (real!=NULL){
210  real->clear();
211  }
212 
213  if (seq!=NULL){
214  seq->clear();
215  }
216 
217  if (external!=NULL){
218  delete external;
219  }
220 
221  seqtrk = NULL;
222  length = 0;
223  attrib = -INFINITY;
224 
225  }
226 
228 
229  //Clean up if necessary
230  if (external != NULL){
231  delete external;
232  external = NULL;
233  }
234 
235  if (seq!= NULL){
236  delete seq;
237  seq = NULL;
238  }
239 
240  if (real!=NULL){
241  delete real;
242  seq = NULL;
243  }
244 
245  if (mask!= NULL){
246  delete mask;
247  mask = NULL;
248  }
249 
250  //Copy rhs over to this
251  realSeq = rhs.realSeq;
252  header = rhs.header;
253  attrib = rhs.attrib;
254  length = rhs.length;
255  seqtrk = rhs.seqtrk;
256  external= rhs.external;
257  max_mask= rhs.max_mask;
259 
260  if (rhs.seq!=NULL){
261  seq = new(std::nothrow) std::vector<uint8_t>(*rhs.seq);
262  if (seq==NULL){
263  std::cerr << "OUT OF MEMORY\nFile" << __FILE__ << "Line:\t"<< __LINE__ << std::endl;
264  exit(1);
265  }
266  }
267  else{
268  seq=NULL;
269  }
270 
271  if (rhs.real!=NULL){
272  real = new(std::nothrow) std::vector<double>(*rhs.real);
273  if (real==NULL){
274  std::cerr << "OUT OF MEMORY\nFile" << __FILE__ << "Line:\t"<< __LINE__ << std::endl;
275  exit(1);
276  }
277  }
278  else{
279  real=NULL;
280  }
281 
282  if (rhs.mask!=NULL){
283  mask = new(std::nothrow) std::vector<int>(*rhs.mask);
284  if (mask==NULL){
285  std::cerr << "OUT OF MEMORY\nFile" << __FILE__ << "Line:\t"<< __LINE__ << std::endl;
286  exit(1);
287  }
288  }
289  else{
290  mask = NULL;
291  }
292 
293  return *this;
294  }
295 
296 
297  //!Get digitized sequence value at a position
298  //! \param position Position in the sequence to get the value for
299  //! \return short integer value of symbol at positiion
300  uint8_t sequence::seqValue (size_t position){
301  if (!realSeq){
302  if (seq!=NULL){
303  return (*seq)[position];
304  }
305  else{
306  std::cerr << "sequence has not been digitized. \n";
307  exit(1);
308  }
309  }
310 
311  std::cerr << "Invalid sequence type queried in sequence class" <<std::endl;
312  exit(1);
313 
314  };
315 
316  //!Get real value of sequence at a position
317  //! \param position Position in the sequence to get the value
318  //! \return positioin
319  double sequence::realValue(size_t position){
320  if (realSeq){
321  if (real!=NULL){
322  return (*real)[position];
323  }
324  else{
325  std::cerr << "Values have not been imported.\n";
326  exit(1);
327  }
328 
329  }
330  std::cerr << "Invalid sequence type queried in sequence class" <<std::endl;
331  exit(1);
332  };
333 
334  //!Get std::string representation of the string
335  //!If the string is a real track, then it will return a string of doubles
336  //!If the string is a non-real track, then it will return a string of shorts, where the shorts are the digitized value of the sequence according to the track
337  //! \return std::string String representation of the sequence
338  std::string sequence::stringify(){
339  std::string output;
340  if (!header.empty()){
341  output+= header +"\n";
342  }
343 
344  if (!seq && !realSeq){
345  output+=undigitized;
346  }
347 
348  if (realSeq){
349  for(size_t i=0;i<length;i++){
350  output+= double_to_string((*real)[i]) + " ";
351  }
352  }
353  else{
354  for(size_t i=0;i<length;i++){
355  output+= int_to_string((int)(*seq)[i]) + " ";
356  }
357  }
358 
359 
360  if (mask){
361  output += "\n";
362  for(size_t i=0;i<length;i++){
363  output+= int_to_string((int)(*mask)[i]) + " ";
364  }
365  }
366 
367  output+="\n";
368  return output;
369  }
370 
371  //!Get the undigitized value of the string
372  //!If the string is a real-track then it will return the same as stringify()
373  //!If the string is a non-real track, it will return undigitized sequence
374  std::string sequence::undigitize(){
375 
376  if (realSeq){
377  return stringify();
378  }
379 
380  if (!seq){ //If the sequence is not digitized yet. Return the undigitized version
381  return undigitized;
382  }
383 
384  std::string output;
385 // if (!header.empty()){
386 // output+= header +"\n";
387 // }
388 
389  if (seqtrk!=NULL){
390  size_t alphaMax = seqtrk->getAlphaMax();
391 
392  for (size_t i=0;i<length;i++){
393  output+=seqtrk->getAlpha((*seq)[i]);
394  if (alphaMax!=1){
395  output+=" ";
396  }
397  }
398  }
399  else {
400  std::cerr << "Track is not defined. Can't undigitize sequence without valid track\n";
401  }
402 
403  return output;
404  }
405 
406 
407  //TODO: Fix return value to return false if no sequence was returned or parsed
408  //!Extract sequence from a fasta file
409  //! \param file File stream
410  //! \param trk Track to use for digitizing sequence
411  //! \return true if function was able to get a sequence from the file
412  bool sequence::getFasta(std::ifstream& file, track* trk,stateInfo* info){
413 
414  if (seq!=NULL){
415  this->clear();
416  }
417 
418  seqtrk=trk;
419 
420 
421 
422  if (!file.good()){
423  return false;
424  }
425 
426  //Find next header mark
427  while(file.peek() != '>'){
428  std::string temp;
429  getline(file,temp,'\n');
430 
431  if (!file.good()){
432  std::cerr << "Sequence doesn't contain a header \">\" "<< std::endl;
433  return false;
434  }
435  }
436 
437  getline(file,header,'\n');
438  bool success(false);
439  //get sequence
440  std::string line;
441  while(getline(file,line,'\n')){
442  undigitized+=line;
443 
444  char nl_peek=file.peek(); // see if we have new sequence header on the next line
445  if (nl_peek=='>'){
446  success = _digitize();
447  break;
448  }
449  else if (nl_peek=='['){
450  success = _digitize();
451  if (info == NULL){
452  std::cerr << "Found brackets [] in fasta sequence.\nHEADER: " << header << "\nCan't import External Definitions without stateInfo from HMM model. Pass stateInfo from model to " << __FUNCTION__ << std::endl;
453  exit(2);
454  }
455  else{
456  external= new (std::nothrow) ExDefSequence(seq->size());
457  external->parse(file, *info);
458  }
459 
460  }
461  else if (nl_peek==EOF){
462  getline(file,line,'\n');
463  success = _digitize();
464  break;
465  }
466  else{
467  continue;
468  }
469  }
470 
471  length=seq->size();
472  return success;
473  }
474 
475  bool sequence::getMaskedFasta(std::ifstream& file, track* trk){
476 
477  if (seq!=NULL){
478  this->clear();
479  }
480 
481  seqtrk=trk;
482 
483  if (!file.good()){
484  return false;
485  }
486 
487  //Find next header mark
488  while(file.peek() != '>'){
489  std::string temp;
490  getline(file,temp,'\n');
491 
492  if (!file.good()){
493  std::cerr << "Sequence doesn't contain a header \">\" "<< std::endl;
494  return false;
495  }
496  }
497 
498  bool success;
499  std::string line, mask_string;
500 
501  getline(file,header,'\n');
502 
503  if (file.good()) {
504  getline(file,undigitized,'\n');
505  success = _digitize();
506  length=seq->size();
507  }
508  if (file.good()) {
509  getline(file,mask_string,'\n');
510 
511  stringList lst;
512  //split string on space, comma, tab
513  clear_whitespace(mask_string,"\n\r");
514  lst.splitString(mask_string, " ,\t");
515  //alloc mask vector
516 
517  if (lst.size() == length) {
518 
519  if (mask!=NULL){
520  delete mask;
521  }
522 
523  mask=new(std::nothrow) std::vector<int>;
524  if (mask==NULL){
525  std::cerr << "OUT OF MEMORY\nFile" << __FILE__ << "Line:\t"<< __LINE__ << std::endl;
526  exit(1);
527  }
528  }
529  else {
530  std::cerr << "Mask length not equal to Sequence length." << std::endl;
531  return false;
532  }
533  //pass vector to lst.toVecInt
534  lst.toVecInt(*mask);
535  max_mask=*max_element(mask->begin(),mask->end());
536  }
537  else {
538  success = false;
539  }
540 
541  return success;
542 
543  }
544 
545  //!Digitize the sequence
547 
548  if (seqtrk==NULL){
549  std::cerr << "Can't digitize sequence without a valid track defined\n";
550  return false;
551  }
552 
553 
554 
555  stringList lst;
557  if (seqtrk->getAlphaMax()>1){
558  lst.splitString(undigitized, " ,\t");
559 
560  if (seq == NULL){
561  seq = new std::vector<uint8_t>(lst.size());
562  }
563  else{
564  seq->assign(lst.size(),0);
565  }
566 
567  for (size_t i=0;i<lst.size();i++){
568  uint8_t symbl = seqtrk->symbolIndex(lst[i]);
569 
570  //Check ambiguous here
571 // if (!seqtrk->isAmbiguousSet()){
572 // std::cerr << "Can't digitize ambiguous character without ambiguous characters being allowed in the model." << std::endl;
573 // return false;
574 // }
575 
576  (*seq)[i] = symbl;
577  }
578  }
579  else{
580  if (seq == NULL){
581  seq = new std::vector<uint8_t>(undigitized.size());
582  }
583  else{
584  seq->assign(undigitized.size(),0);
585  }
586 
587  for (size_t i=0; i<undigitized.size();i++){
588  uint8_t symbl = seqtrk->symbolIndex(undigitized[i]);
589 
590 
591 // //Check ambiguous here
592 // if (!seqtrk->isAmbiguousSet()){
593 // std::cerr << "Can't digitize ambiguous character without ambiguous characters being allowed in the model." << std::endl;
594 // return false;
595 // }
596  (*seq)[i] = symbl;
597  }
598  }
599 
600  undigitized.clear(); //Once sequence is digitized we don't need the old seqeunce string
601 
602  return true;
603  }
604 
605 
606  //! Import one fastq entry from the file
607  //!FastQ format:
608  //!Line 1: Start with @
609  //!Line 2: Sequence , Can be multiple lines
610  //!Line 3: Start with +
611  //!Line 4: Quality Score , Can be multiple lines
612  //! \param file File stream to file
613  //! \param trk Track to used to digitize
614  //! \return true if the sequence was successfully imported
615  bool sequence::getFastq(std::ifstream& file, track* trk){
616 
617  if (seq!=NULL){
618  this->clear();
619  }
620 
621  seqtrk=trk;
622 
623  if (!file.good()){
624  return false;
625  }
626 
627  //Find first line that starts with "@" and Get header
628 
629  //Move down until the next line has a "@"
630  while(file.peek() != '@' && file.good()){
631  std::string temp;
632  getline(file,temp,'\n');
633  }
634 
635  //Get Header (One line)
636  if (file.good()){
637  getline(file,header,'\n');
638  }
639  else{
640  return false;
641  }
642 
643 
644  std::string sequence="";
645  //Get sequence (Multiple Lines)
646  if (file.good()){
647  while(getline(file,sequence,'\n')){
649 
650  char nl_peek=file.peek(); // see if we have new sequence header on the next line
651  if (nl_peek=='+'){ //If next line begins with + then we are at a
652  break;
653  }
654  else if (nl_peek==EOF){
655  break;
656  }
657  else{
658  continue;
659  }
660  }
661  }
662  else{
663  return false;
664  }
665 
666  _digitize();
667 
668  std::string quality_string;
669 
670  //Get Quality String (Multiple Lines)
671  if (file.good()){
672  while(getline(file,sequence,'\n')){
673  quality_string+=sequence;
674 
675  char nl_peek=file.peek(); // see if we have new sequence header on the next line
676  if (nl_peek=='@'){ //If next line begins with + then we are at a
677  if (quality_string.size() < undigitized.size()){
678  continue;
679  }
680 
681  break;
682  }
683  else if (nl_peek==EOF){
684  break;
685  }
686  else{
687  continue;
688  }
689  }
690  }
691  else{
692  return false;
693  }
694 
695  length=seq->size();
696  return true;
697  }
698 
699 
700  //TODO: fix the return, so it returns false if sequence wasn't imported from file
701  //! Import one Real number sequence from the file
702  //! \param file File stream to file
703  //! \param trk Track to used to digitize
704  //! \return true if the sequence was successfully imported
705  bool sequence::getReal(std::ifstream& file, track* trk, stateInfo* info){
706 
707  if (real!=NULL){
708  this->clear();
709  }
710 
711  seqtrk=trk;
712 
713  //get header
714  while(file.peek() != '@' && !file.eof()){
715  std::string temp;
716  getline(file,temp,'\n');
717  }
718 
719  if (file.eof()){
720  std::cerr << "No header found for sequence. Header should start line with \"@\".\n";
721  exit(2);
722  }
723 
724  std::string sequence="";
725  getline(file,sequence,'\n');
727 
728  //get sequence
729  std::string line;
730  stringList lst;
731  while(getline(file,line,'\n')){
732  lst.splitString(line,",\t ");
733  std::vector<double> temp (lst.toVecDouble());
734  for(size_t i=0;i<temp.size();i++){
735  real->push_back(temp[i]);
736  }
737 
738  char nl_peek=file.peek(); // see if we have new sequence header on the next line
739  if (nl_peek=='>'){
740  _digitize();
741  break;
742  }
743  else if (nl_peek=='['){
744  _digitize();
745  if (info == NULL){
746  std::cerr << "Found brackets [] in fasta sequence.\nHEADER: " << header << "\nCan't import External Definitions without stateInfo from HMM model. Pass stateInfo from model to " << __FUNCTION__ << std::endl;
747  exit(2);
748  }
749  else{
750  external= new (std::nothrow) ExDefSequence(seq->size());
751  external->parse(file, *info);
752  }
753 
754  }
755  else if (nl_peek==EOF){
756  _digitize();
757  break;
758  }
759  else{
760  continue;
761  }
762  }
763 
764  length=real->size();
765  return true;
766  }
767 
768  //! Return the mask at sequence position
769  int sequence::getMask(size_t position) {
770 
771  if (mask!=NULL){
772  if (position < getLength()) {
773  return (*mask)[position];
774  }
775  else {
776  std::cerr << "Position exceeds sequence length.\n";
777  exit(1);
778  }
779  }
780  else{
781  std::cerr << "No Mask information.\n";
782  exit(1);
783  }
784  }
785 
786  //!Set the sequence from a std::string
787  //! \param sq Sequence to be used as sequence
788  //! \param tr Track to be used to digitize sequence
789  void sequence::setSeq (std::string& sq, track* tr){
790  realSeq= false;
791  undigitized = sq;
792 
793  if (tr!= NULL){
794  seqtrk = tr;
795  _digitize();
796  length = seq->size();
797  }
798  else{
799  length = sq.size();
800  }
801 
802  return;
803  }
804 
805  //!Set the sequence from a vector of doubles
806  //! \param rl Vector of doubles to be used as real number sequence
807  //! \param tr Track to be used to digitize sequence
808  void sequence::setRealSeq(std::vector<double>* rl, track* tr){
809  seqtrk = tr;
810  realSeq = true;
811  real=rl;
812 
813  length = rl->size();
814 
815  return;
816  }
817 
818 
819  //! Get the symbol (alphabet character or word) for a a given position of a alphanumerical sequence
820  //! \param pos Position within sequence
821  //! \return string String of the symbol
822  //! \sa track::getAlpha(short)
823  std::string sequence::getSymbol(size_t pos) const{
824  if (realSeq){
825  std::cerr << "Can't get symbol of real values\n";
826  return "";
827  }
828 
829  if (seqtrk==NULL){
830  std::cerr << "track is undefined for sequence\n";
831  return "";
832  }
833 
834  return seqtrk->getAlpha((*seq)[pos]);
835  }
836 
837 
838 // //! Get the index value of word at a given position and with a order of dependence)
839 // //! Calulates the index for a character/word and returns an Index type. If the word is non-ambiguous then it will return 1 value for row and column. First pair is column index, Second pair is row index. If symbol is ambiguous, index contains all possible words that match.
840 // //! \param[in] position Position within the sequence
841 // //! \param[in] order Order of dependence
842 // //! \param[out] word index pair of index values
843 // //! \sa Index
844 //
845 // void sequence::get_index(size_t position, int order, std::pair<Index,Index>& word_index){
846 // int letter=seqValue(position);
847 // size_t alphabet=seqtrk->getAlphaSize();
848 //
849 // Index& letter_index=word_index.first;
850 //
851 // if (letter<0){
852 // letter_index.setAmbiguous(seqtrk->getAmbiguousSet(letter));
853 // }
854 // else{
855 // letter_index+=letter;
856 // }
857 //
858 // Index& y_subtotal = word_index.second;
859 //
860 // if (order!=0){
861 //
862 // for(int k=order;k>=1;k--){
863 // int prev_letter=seqValue(position-k);
864 // Index word;
865 // if (prev_letter<0){
866 // word.setAmbiguous(seqtrk->getAmbiguousSet(prev_letter));
867 // y_subtotal+=word * POWER[k-1][alphabet-1];
868 // }
869 // else{
870 // y_subtotal+=prev_letter * POWER[k-1][alphabet-1];
871 // }
872 // }
873 // }
874 //
875 // return;
876 // }
877 
878  //! Reverse the sequence; If mask is defined, the mask will also be reversed
879  //! \return true if reverse was successfully performed on sequence
881  if (realSeq){
882  if (real!=NULL){
883  std::reverse(real->begin(), real->end());
884  return true;
885  }
886  }
887  else if (seq!=NULL){
888  std::reverse(seq->begin(), seq->end());
889 
890  if (mask!=NULL){
891  std::reverse(mask->begin(), mask->end());
892  }
893  return true;
894  }
895 
896  //Handle non-digitized sequence
897  if (seqtrk!=NULL){
898  size_t max_size = seqtrk->getAlphaMax();
899  if (max_size ==1){
900  if (undigitized.size()>0){
901  std::reverse(undigitized.begin(),undigitized.end());
902  return true;
903  }
904  else{
905  std::cerr << "No sequence is defined to reverse\n";
906  }
907  }
908  else{
909  std::cerr << "Reverse on undigitized sequence isn't defined for track alphabets that are more than one character\n";
910  }
911  }
912 
913  return false;
914  }
915 
916 
917  //! Complements the sequence.
918  //! \return true if complementation was successful
920  if (realSeq){
921  std::cerr<< "sequence::complement isn't defined for real valued sequences\n";
922  }
923  else if (seqtrk==NULL){
924  std::cerr << "StochHMM::track is not defined. Can't complement without defined complement in track\n";
925  }
926  else if (seq!=NULL){
927 
928  for (size_t i = 0; i < seq->size(); i++) {
929  (*seq)[i] = seqtrk->getComplementIndex((*seq)[i]);
930  }
931 
932  return true;
933  }
934  else {
935  size_t undigitized_size=undigitized.size();
936  if (undigitized_size>0){
937  size_t max_size = seqtrk->getAlphaMax();
938  if (max_size ==1){
939  for(size_t i=0;i<undigitized_size;i++){
940  std::string character = seqtrk->getComplementSymbol(undigitized[i]);
941  undigitized[i] = character[0];
942  }
943  return true;
944 
945  }
946  else{
947  std::cerr << "Complement on undigitized sequence isn't defined for track alphabets that are more than one character\n";
948  }
949 
950  }
951  else{
952  std::cerr << "No sequence defined\n";
953  }
954 
955  }
956  return false;
957  }
958 
959 
960  //! Reverses and complements the sequence
961  //! \return true if both reverse and complement were successful
963  if (!reverse()){
964  std::cerr << "Unable to perform reverseComplement on sequence because reverse failed\n";
965  return false;
966  };
967  if (!complement()){
968  std::cerr << "Unable to perform reverseComplement on sequence because complement failed\n";
969  return false;
970  };
971  return true;
972  }
973 
974  // Digitize a sequences
976  if (realSeq){
977  return true;
978  }
979  else if (undigitized.size() > 0){
980  _digitize();
981  return true;
982  }
983  else if (seq!=NULL){
984  std::cerr << "Digitized sequence already exists\n";
985  return true;
986  }
987 
988  std::cerr << "No undigitized sequence exists to convert\n";
989  return false;
990  }
991 
992 
993  //Shuffles the sequence
995 
996  if (realSeq){
997  std::random_shuffle(real->begin(), real->end());
998  }
999  else if (seq!=NULL){
1000  std::random_shuffle(seq->begin(), seq->end());
1001  }
1002  else{
1003  std::random_shuffle(undigitized.begin(), undigitized.end());
1004  }
1005  return;
1006  }
1007 
1008 
1009  //Randomly generate a sequence based on Probabilities of each character
1010  sequence random_sequence(std::vector<double>& freq, size_t length, track* tr){
1011  sequence random_seq;
1012 
1013  if (tr==NULL){
1014  std::cerr << "Track is not defined" << std::endl;
1015  return random_seq;
1016  }
1017 
1018  size_t alphaSize=tr->getAlphaSize();
1019  size_t freqSize=freq.size();
1020 
1021  if (alphaSize!=freqSize){
1022  std::cerr << "Frequency distribution size and Alphabet size must be the same." << std::endl;
1023  return random_seq;
1024  }
1025 
1026  //Create CDF of frequency distribution
1027  std::vector<std::pair<double,std::string> > cdf;
1028  double sum = 0.0;
1029  for(size_t i=0;i<freqSize;++i){
1030  sum+=freq[i];
1031  std::pair<double,std::string> val (sum, tr->getAlpha(i));
1032  cdf.push_back(val);
1033  }
1034 
1035  //Generate random sequence
1036  std::string random_string;
1037  for(size_t j=0;j<length;++j){
1038  double val = ((double)rand()/((double)(RAND_MAX)+(double)(1))); //Generate random
1039  for (size_t m=0;m<freqSize;++m){ //Check to see which value is
1040  if (cdf[m].first>=val){
1041  random_string+=cdf[m].second;
1042  break;
1043  }
1044  }
1045  }
1046 
1047  random_seq.setSeq(random_string, tr);
1048  return random_seq;
1049  }
1050 
1051 }