#include #include #include #include #include #include #include #include #include #include #include using namespace std; // Interface to gsl_cdf_hypergeometric_Q with some add-on behaviour: // gsl_cdf_hypergeometric_Q returns P(x>k) this interface returns P(x>=k) double hypgeom_Q(unsigned int N, unsigned int K, unsigned int n, unsigned int k){ if(k==0) //special case P(x>=k|k=0)= 1 return 1; else // normal case, should give P(x>=k) return gsl_cdf_hypergeometric_Q(k-1,K,N-K,n); } // Generates random permutation of length N, containing K true-values and N-K false-values bool* get_random_genome(unsigned int N, unsigned int K, const gsl_rng* rng){ bool* perm= new bool[N]; unsigned int i; for(i=0; i[0], and ending, $C[i]->[1], indices of segment i. */ vector get_segments(bool* b, int N, int maxgap){ vector segms; int* segm; int i1=0; int i2=0; int i3=0; while(i1 breaking loop2 and loop1 break; } else{ i2++; } } else{ i3=i2; while(i3maxgap){ segm= new int[2]; segm[0]= i1; segm[1]= i2-1; segms.push_back(segm); i1=i3; i2=N; // back to loop1 -> breaking loop3 and loop2 break; } else if(b[i3]){ i2=i3; break; // back to loop2 -> beaking loop3 } else if(i3==N-1){ segm= new int[2]; segm[0]=i1; segm[1]= i2-1; segms.push_back(segm); i2=N; // iterated till the end of array -> breaking loop3, loop2 and loop1 i1=N; break; } else{ i3++; } } //close loop3 } } //close loop2 } } //close loop1 return segms; } vector get_nk_values(bool* b, vector segms){ vector nks; unsigned int* nk; unsigned int n,k; for(unsigned int i=0; i bootstrap(int N, int K, vector Ps, int I, int maxgap, unsigned long seed){ gsl_rng * rng = gsl_rng_alloc (gsl_rng_taus); gsl_rng_set (rng, seed); bool* genome; vector segms; vector segms_temp; vector nk; // (n,k) value pairs obtained from segments vector p_sim; vector p_sim_min(I,0); for(int i=0; i Ps_adj(Ps.size(), 0 ); for(unsigned int i=0; i) ); int j= 0; double p= Ps[i]; while(j segms; vector nks; gsl_rng * rng = gsl_rng_alloc (gsl_rng_taus); gsl_rng_set (rng, time(NULL)); cout << "# Testing clustering, with maxgap=" << maxgap << "\n"; cout << "Iteration\tVector\tSegments\t(n,k)\n"; cout << "\t"; for(int i=0; i=k_th){ if(!counted){ count++; counted=true; } } cout << "\n"; } cout << "# random genomes with at least one window with k>=" << k_th << "\n" << count << "/" << I << "\n"; } int main (int argc, char** argv) { // TESTING //test_get_random_genome(20,10,10); // test_gen_random_genome(N,K,Iterations) //test_get_random_genome(20,0,5); //test_get_clusters(2751,76,5,100,5); //test_get_clusters(N,K,k_th,Iterations,maxgap) //test_get_clusters(20,10,10,3); //checking input if(argc < 3){ cerr << "USAGE:\n" << argv[0] << " filein fileout\n" << "Where filein contains two blocks of data delimited by an empy line.\n" << "First block should contain lines composed of a header a tab and a \n" << "header value. Required headers are N,K and maxgap. Lines with required\n" << "headers can be in any order, lines with other headers are ignored.\n" << "The second block should start with a line of tab-delimited headers,\n" << "including, but not limiting to, \'n\' and \'k\'. The corresponding\n" << "unsigned integer values should be delimited by tabs and layed out\n" << "starting from the second line of this block.\n"; exit(1); } unsigned int N,K,maxgap; char* fin= argv[1]; ifstream in (fin , ifstream::in); char buffer[256]; string header; //Reading data: Block 1 while(in.getline(buffer,256)){ string line(buffer); if(line == "") break; //end of Block 1 istringstream iss (line,istringstream::in); iss >> header; if(header == "N"){ iss >> N; }else if(header == "K"){ iss >> K; }else if(header == "maxgap"){ iss >> maxgap; } } //Reading data: Block 2: headers in.getline(buffer,256); string line(buffer); istringstream iss (line,istringstream::in); vector map; unsigned int col_count=0; while(iss >> header){ if(header == "n"){ map.push_back(0);} else if(header == "k"){ map.push_back(1);} else{ map.push_back(-1); } col_count++; } //Reading data: Block 2: data vector ns; vector ks; vector data[]= {ns,ks}; unsigned int counter=0; int col; unsigned int value; string rubbish; do{ col= map[counter++%col_count]; if(col>=0){ in >> value; if(in) data[col].push_back(value); } else{ in >> rubbish; } }while(in); in.close(); ns= data[0]; ks= data[1]; // Calculating P-values and P_adj-values unsigned int I= 1000; unsigned long seed= time(NULL); vector Ps(ns.size(),0); for(unsigned int i=0; i Ps_adj; Ps_adj= bootstrap(N,K,Ps,I,maxgap,seed); // Writing output: Block 1 char* fout= argv[2]; ofstream out (fout); out << "N\t" << N << "\n"; out << "K\t" << K << "\n"; out << "maxgap\t" << maxgap << "\n\n"; // Writing output: Block 2 out << "n\tk\tP\tP_adj\n"; for(unsigned int i=0; i