StochHMM  v0.34
Flexible Hidden Markov Model C++ Library and Application
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
trainingSeq.cpp
Go to the documentation of this file.
1 //
2 // trainingSeq.cpp
3 // StochHMM
4 //
5 // Created by Paul Lott on 3/5/12.
6 // Copyright (c) 2012 University of California, Davis. All rights reserved.
7 //
8 
9 #include "trainingSeq.h"
10 
11 sequence::sequence(string& fileName){
12  file=new std::ifstream;
13  file->open(fileName.c_str());
14  if (!file->is_open()){
15  cerr << "Couldn't find file: " << fileName <<endl;
16  exit (1);
17  }
18  masked=false;
19 
20  return;
21 }
22 
23 bool sequence::next(){
24  header="";
25  DNA="";
26  seq.clear();
27 
28  if (file==NULL || file->eof()){
29  return false;
30  }
31 
32  //Find first Header line then import header
33  while(file->peek() != '>'){
34  std::string temp;
35 
36  if (file->eof()){
37  return false;
38  }
39 
40  getline(*file,temp,'\n');
41  }
42  getline(*file,header,'\n');
43 
44  if (file->eof()){
45  return false;
46  }
47 
48  std::string temp;
49  //get sequence
50  while(getline(*file,temp,'\n')){
51 
52  for(int j=0;j<temp.size();j++){
53  temp[j]=toupper(temp[j]);
54  switch (temp[j]){
55  case 'A':
56  seq.push_back(0);
57  break;
58  case 'C':
59  seq.push_back(1);
60  break;
61  case 'G':
62  seq.push_back(2);
63  break;
64  case 'T':
65  seq.push_back(3);
66  break;
67  default:
68  temp[j]='N';
69  seq.push_back(-1);
70  }
71  }
72 
73  DNA+=temp;
74 
75  char nl_peek=file->peek(); // see if we have new sequence header on the next line
76  if (nl_peek=='>'){
77  break;
78  }
79  else if (nl_peek==EOF){
80  break;
81  }
82  else{
83  continue;
84  }
85  }
86 
87  if(masked){
88  importMask();
89  if (mask.size() != seq.size()){
90  cerr << "Mask not the same size as sequence\n";
91  }
92  }
93 
94  return true;
95 }
96 
97 bool sequence::importMask(){
98  mask.clear();
99 
100  if (file==NULL || file->eof()){
101  return false;
102  }
103 
104  std::string temp;
105  //Find first Header line then import header
106  while(file->peek() != '>'){
107 
108  if (file->eof()){
109  return false;
110  }
111 
112  getline(*file,temp,'\n');
113  }
114  getline(*file,temp,'\n');
115 
116  if (file->eof()){
117  return false;
118  }
119 
120  //get sequence
121  while(getline(*file,temp,'\n')){
122 
123  char * str=new char[temp.size()+1];
124  str[temp.size()]=0;
125  memcpy(str, temp.c_str(), temp.size());
126 
127  char * pch;
128  pch = strtok (str," ,.");
129  while (pch != NULL)
130  {
131  mask.push_back(atoi(pch));
132  pch = strtok (NULL, " ,.");
133  }
134 
135  char nl_peek=file->peek(); // see if we have new sequence header on the next line
136  if (nl_peek=='>'){
137  break;
138  }
139  else if (nl_peek==EOF){
140  break;
141  }
142  else{
143  continue;
144  }
145  }
146 
147  return true;
148 }
149 
150 //Creates the reverse compliment of a sequence
152  size_t length = seq.size();
153  for (int i=0;i<length;i++){
154  switch (seq[i]){
155  case 0:
156  seq[i]=3; break;
157  case 1:
158  seq[i]=2; break;
159  case 2:
160  seq[i]=1; break;
161  case 3:
162  seq[i]=0; break;
163  default:
164  seq[i]=-1;
165  }
166  }
167  reverse(seq.begin(),seq.end());
168  return;
169 }
170 
171 
172 bool trainingSeqs::openFiles(std::string& filename){
173 
174 }
175 
176 
178 
179 }
180 
181 
182 //Read upto million characters and determine the alphabet of that sequence
183 //Return a vector<std::string>
185  const std::ifstream* file(seqFile[iter]);
186 
187  //Import a buffer full of the sequence
188  std::string temp;
189  std::string undigitized;
190 
191  while(getline(*file,temp,'\n')){
192  if (temp[0]== '>'){
193  continue;
194  }
195  else{
196  undigitized+=temp;
197  }
198 
199  if (undigitized.size() >= MAX_BUFFER){
200  break;
201  }
202  }
203 
204  std::set<std::string> alphabet;
205  pair<std::set<int>::iterator,bool> ret;
206 
207  for(size_t i=0;i<undigitized.size();i++){
208  ret = alphabet.insert(undigitized[i]);
209  if (ret.second()){
210  indexAlpha[iter].push_back(undigitized[i]);
211  }
212  }
213 
214  for(size_t i=0;i<indexAlpha[iter].size();i++){
215  string &temp= indexAlpha[iter][i];
216  alphaIndex[iter][temp]=i;
217  }
218 
219  file.seekg(0,std::ios::beg);
220  file.clear();
221 
222  return;
223 }
224 
225 void trainingSeqs::setAlphabet(size_t iter, stringList& lst){
226  for(size_t i=0;i<lst.size();i++){
227  string& temp = lst[i];
228  indexAlpha[iter].push_back(temp);
229  alphaIndex[iter][temp]=i;
230  }
231  return;
232 }
233 
234 
235 // bool sequence::getFasta(std::ifstream& file, track* trk){
236 //
237 // seqtrk=trk;
238 //
239 // if (!file.good()){
240 // return false;
241 // }
242 //
243 // //Find next header mark
244 // while(file.peek() != '>'){
245 // std::string temp;
246 // getline(file,temp,'\n');
247 // }
248 //
249 // getline(file,header,'\n');
250 //
251 // //get sequence
252 // std::string line;
253 // while(getline(file,line,'\n')){
254 // undigitized+=line;
255 //
256 // char nl_peek=file.peek(); // see if we have new sequence header on the next line
257 // if (nl_peek=='>'){
258 // _digitize();
259 // break;
260 // }
261 // else if (nl_peek=='['){
262 // _digitize();
263 // //external=getExDef(file,nextSeq->size());
264 // }
265 // else if (nl_peek==EOF){
266 // _digitize();
267 // break;
268 // }
269 // else{
270 // continue;
271 // }
272 // }
273 //
274 // length=seq->size();
275 // return true;
276 // }
277 //
278 // void sequence::_digitize(){
279 // stringList lst;
280 // clear_whitespace(undigitized,"\n");
281 // if (seqtrk->getAlphaMax()>1){
282 // lst.splitString(undigitized, " ,\t");
283 // }
284 // else{
285 // lst.fromAlpha(undigitized, 1);
286 // }
287 //
288 // for (size_t i=0;i<lst.size();i++){
289 // short symbl = seqtrk->symbolIndex(lst[i]);
290 // seq->push_back(symbl);
291 // }
292 //
293 // return;
294 // }