StochHMM  v0.34
Flexible Hidden Markov Model C++ Library and Application
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
sequence.h
Go to the documentation of this file.
1 //
2 // sequence.h
3 //Copyright (c) 2007-2012 Paul C Lott
4 //University of California, Davis
5 //Genome and Biomedical Sciences Facility
6 //UC Davis Genome Center
7 //Ian Korf Lab
8 //Website: www.korflab.ucdavis.edu
9 //Email: lottpaul@gmail.com
10 //
11 //Permission is hereby granted, free of charge, to any person obtaining a copy of
12 //this software and associated documentation files (the "Software"), to deal in
13 //the Software without restriction, including without limitation the rights to
14 //use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
15 //the Software, and to permit persons to whom the Software is furnished to do so,
16 //subject to the following conditions:
17 //
18 //The above copyright notice and this permission notice shall be included in all
19 //copies or substantial portions of the Software.
20 //
21 //THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
22 //IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
23 //FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
24 //COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
25 //IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
26 //CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
27 
28 #ifndef SEQUENCE_H
29 #define SEQUENCE_H
30 
31 #include <vector>
32 #include <stdlib.h>
33 #include <string>
34 #include <iostream>
35 #include <math.h>
36 #include <fstream>
37 #include <algorithm>
38 #include "text.h"
39 #include "track.h"
40 #include "stateInfo.h"
41 #include "externDefinitions.h"
42 #include "index.h"
43 
44 //! \file
45 
46 namespace StochHMM{
//! \class sequence
//! Holds the data for a single sequence, together with the functions used to import and digitize it.
//! A sequence is either a series of real numbers (double values) or a series of discrete symbols
//! (characters or words) drawn from the alphabet of its track.
//! class sequence supports 255 discrete values.
51  class sequence{
52  public:
53 
54  //Constructors
55 
56  sequence();
57  sequence(bool); //True if Real number track, False if alpha
58  //sequence(trackType);
59  sequence(std::vector<double>*,track*);
60  sequence(std::string&, track*);
61  sequence(char* , track*);
62 
63  ~sequence();
64 
65  //Copy Constructors
66  sequence(const sequence&);
67  sequence& operator= (const sequence&);
68 
69  friend class sequences;
70  friend class sequenceStream;
71 
72  //ACCESSOR
73 
74  //!Get reference to undigitized sequence
75  //!If sequence hasn't been undigitized then it will undigitize it and
76  //!store the result. (Only undigitizes the sequence once, then passes
77  //!reference to undigitized sequence)
78  inline std::string* getUndigitized(){
79  if (!undigitized.empty() || seq->empty()){
80  return &undigitized;
81  }
82  else {
84  return &undigitized;
85  }
86  }
87 
88  //!Get the size of the sequence
89  inline size_t getLength(){return length;};//Returns length of sequence
90 
91  //!Get the attribute value for the sequence
92  //!Selection of model may use this value to determine which model to use
93  //! \sa setAttrib
94  inline double getAttrib(){return attrib;}; //Returns the Attribute value for the sequence
95 
96  //!Get pointer to ExDefSequence for the sequence
97  //! \return ExDefSequence*
98  inline ExDefSequence* getExDef(){return external;};
99 
100  //!Check to see if exDef is defined for the sequence
101  //! \return true if ExDefSequence is defined for sequence
102  //! \return false if no External definition exists for sequence
103  inline bool exDefDefined(){if (external){return true;} return false;};
104 
105  double realValue(size_t); // Returns Sequence Value at position
106  uint8_t seqValue (size_t); // Returns Digitized Value at position
107  //char charValue(size_t); // Returns Alpha Character Value at position
108 
109  //!Get the size of the sequence
110  //! \return size_t size of the sequence
111  inline size_t size(){if (realSeq){return real->size();} else {return seq->size();}}; // Returns size of sequence
112 
113  //! Get the pointer to the track that is defined for the sequence;
114  //! \return pointer to track
115  inline track* getTrack(){return seqtrk;};
116 
117  inline void setTrack(track* tr){
118  seqtrk = tr;
119  return;
120  }
121 
122 
123  //! Print the string represntation of the sequence to stdout
124  //! Prints the digitized version
125  inline void print(){std::cout << stringify() << std::endl;}; //Print sequence to stdout
126  std::string stringify(); // Get sequence as string
127 
128 
129  //! Undigitize the sequence
130  //! If the sequence has not been digitized then it will return directly
131  //! If the sequence has been digitized then it will undigitize it and return it
132  //! \return character or word sequence from fasta
133  std::string undigitize();
134 
135  //MUTATOR
136  //!Set the sequence attribute value
137  //!\param attr Value of attributes for sequence;
138  inline void setAttrib(double attr){attrib=attr;}; //!Set the attribute value
139 
140  //!Set the header of the sequence
141  //!\param head Header of the sequence
142  inline void setHeader(std::string& head){header=head;};
143 
144  void setSeq(std::string&,track*);
145  void setRealSeq(std::vector<double>*,track*);
146 
147  inline bool getFasta(std::ifstream& file){return getFasta(file,NULL,NULL);}
148  inline bool getFasta(std::ifstream& file, track* trk){ return getFasta(file,trk,NULL);}
149  bool getFasta(std::ifstream&, track*, stateInfo*);
150 
151 
152  bool getMaskedFasta(std::ifstream&, track*);
153  bool getFastq(std::ifstream&, track*);
154 
155  inline bool getReal (std::ifstream& file){return getReal(file,NULL,NULL);}
156  inline bool getReal (std::ifstream& file, track* trk){ return getReal(file,trk,NULL);}
157  bool getReal (std::ifstream&, track*, stateInfo*);
158 
159  int getMaxMask(){return max_mask;}
160  int getMask(size_t);
161 
162  std::string getSymbol(size_t) const;
163 
164  void get_index(size_t position, int order, std::pair<Index, Index>& word_index);
165 
166 
167  //! Returns the header of the sequence as a std::string
168  inline std::string getHeader() { return header; }
169 
170  bool reverseComplement();
171  bool complement();
172  bool reverse();
173 
174  //!Converts sequence digital based on track alphabet
175  bool digitize();
176 
177  //! Shuffles the sequence using std::random_shuffle
178  void shuffle();
179 
180  inline std::vector<uint8_t>* getDigitalSeq(){return seq;}
181 
182  inline uint8_t operator[](size_t index){return (*seq)[index];}
183 
184 
185  //!Empty Sequence
186  void clear();
187 
188 
189  //void getNext (std::ifstream&, track*);
190 
191 
192  //bool _checkSequence(); //!Check the sequence adheres to the track alphabet
193 
194  private:
195  bool realSeq; //If Real number sequence
196  std::string header; // Header from the sequence
197 
198  double attrib; //Attribute value (Could be %GC or whatever user defines)
199  size_t length; //Lenght of the Sequence
200 
201  track* seqtrk; //Ptr to track describing alphabet and type
202 
203  ExDefSequence* external; //External definitions
204  //Stores defined states for given sequence
205 
206  // FIXME:: DIGITIZED SEQUENCES STORED AS SHORT. NEED TO STANDARDIZE BOTH TRACK AND SEQUENCE CLASS (Track stores as (int) but sequence stores as short.
207  std::vector<uint8_t>* seq; // Digitized Sequence
208  std::vector<double>* real; // Real Number Sequence
209  std::vector<int>* mask; //Stores State masking information for training
210  int max_mask; //Maximum mask number
211 
212 
213  std::string undigitized; //Undigitized sequence
214 
215  bool _digitize(); //Digitize the sequence
216  };
217 
218 
219 
220  //!Randomly generate a sequence based on Probabilities of each character
221  //! \param freq Reference to std::vector<double> that contains frequencies of alphabet corresponding to alphabet in track
222  //! \param length Length of sequence to generate
223  //! \param tr Pointer to StochHMM::track where alphabet and ambiguous characters are defined
224  sequence random_sequence(std::vector<double>& freq, size_t length, track* tr);
225 // sequence random_sequence(emm*);
226 // sequence translate();
227 
228 }
229 #endif /*SEQUENCE_H*/