class Decoymaker
Public Class Methods
make_decoys(p1, p2, p3, p4)
click to toggle source
static VALUE decoymaker_make_decoys(VALUE self,VALUE input_file_in, VALUE db_length_in,VALUE output_file_in,VALUE prefix_string_in) { char *infile = StringValueCStr(input_file_in); long sequences_to_generate = NUM2INT(db_length_in); char * outfile = StringValueCStr(output_file_in); char *prefix_string = StringValueCStr(prefix_string_in); char line[MAX_LINE_LENGTH]; // char settings_line[60][70]; char *p,**index; char one_sequence[MAX_SEQUENCE_LENGTH]; char random_sequence[(int)(MAX_SEQUENCE_LENGTH*1.5)]; char random_sequence_output[(int)(MAX_SEQUENCE_LENGTH*1.5)]; char *temp_sequence; int a; FILE *inp, *outp; long i, j, k, l, n, n_sequences, protein; long MP[21][MAX_SEQUENCE_LENGTH]; long measured_aa_freq[21], generated_aa_freq[21], measured_pl_sum=0, generated_pl_sum=0; long row_sum[MAX_SEQUENCE_LENGTH],partial_sum; long one_index,pl; double x; /* scanning sequence database */ if ((inp = fopen(infile, "r"))==NULL) { printf("error opening sequence database %s\n",infile);return -1; } long total_sequence_len=0; n=0; while (fgets(line, MAX_LINE_LENGTH, inp) != NULL) { total_sequence_len+=strlen(line); // printf("%ld\n",i);fflush(stdout); if (line[0]=='>') { n++; } } n_sequences=n; /* reading sequence database */ temp_sequence=(char*)calloc(sizeof(char),MAX_SEQUENCE_LENGTH); char *sequence_block=(char*)malloc(sizeof(char)*(total_sequence_len+2)); index=(char**)malloc(sizeof(char*)*n_sequences); index[0]=sequence_block; /* set first index pointer to beginning of first database sequence */ if ((inp = fopen(infile, "r"))==NULL) { printf("error opening sequence database %s\n",infile); return -1; } n=-1; strcpy(temp_sequence,"\0"); while (fgets(line, MAX_LINE_LENGTH, inp) != NULL) { RemoveSpaces(line); if (strcmp(line,"\n")==0) { // Skips blank lines continue; } if (line[0]=='>') { if (n>=0) { strcpy(index[n],temp_sequence); n++; index[n]=index[n-1]+sizeof(char)*strlen(temp_sequence); strcpy(temp_sequence,"\0"); } else { n++; strcpy(temp_sequence,"\0"); } } else { if ( (strlen(temp_sequence)+strlen(line))>=MAX_SEQUENCE_LENGTH ) { continue; } strncat(temp_sequence,line,strlen(line)-1); } } strcpy(index[n],temp_sequence); fclose(inp); n_sequences=n+1; // printf("done [read %li sequences (%li amino acids)]\n",n_sequences,(int)(index[n_sequences-1]-index[0])/sizeof(char)+strlen(temp_sequence));fflush(stdout); // measured_pl_sum=(int)(index[n_sequences-1]-index[0])/sizeof(char)+strlen(temp_sequence); /* generating Markov probabilities */ // printf("generating Markov probability matrix..."); // fflush(stdout); srand(time(0)); /* replace with constant to re-generate identical random databases */ for(i=0;i<MAX_SEQUENCE_LENGTH;i++) { for(j=0;j<=20;j++) { MP[j][i]=0; } } for(j=0;j<=20;j++) { measured_aa_freq[j]=0; generated_aa_freq[j]=0; } for(protein=0;protein<n_sequences;protein++) { if (protein<(n_sequences-1)) { long len_one_seq = (index[protein+1]-index[protein])/sizeof(char); if ( len_one_seq > MAX_SEQUENCE_LENGTH ){ printf("Seq is longer than max len \n");fflush(stdout); len_one_seq=MAX_SEQUENCE_LENGTH; } strncpy(one_sequence,index[protein],len_one_seq); one_sequence[len_one_seq]='\0'; // NULL terminate the string } else { strcpy(one_sequence,index[protein]); } pl=strlen(one_sequence); n=1; one_index=0; for(i=0;i<pl;i++) { if(strpbrk(NOT_AMINO_ACIDS,(const char *)&one_sequence)==NULL) { if ( strchr(AMINO_ACIDS,one_sequence[i])==NULL) { printf("Unknown amino acid %c",one_sequence[i]); } else { a=20-strlen(strchr(AMINO_ACIDS,one_sequence[i])); // current amino acid MP[a][i]++; measured_aa_freq[a]++; } } else { a=floor(20*(float)rand()/RAND_MAX); MP[a][i]++; measured_aa_freq[a]++; } // replace B, X, Z etc. with random amino acid to preserve size distribution } MP[20][pl]++; measured_aa_freq[20]++; // MP[20][n] is the number of sequences of length n in the database } for(i=0;i<MAX_SEQUENCE_LENGTH;i++){ row_sum[i]=0; } for(i=0;i<MAX_SEQUENCE_LENGTH;i++){ for(j=0;j<=20;j++){ row_sum[i]+=MP[j][i]; } } /* generate random protein sequences through Markov chain */ if ((outp = fopen(outfile, "w"))==NULL) { printf("error opening output file %s\n",outfile); return -1; } for(protein=0;protein<sequences_to_generate;protein++) { i=0; j=0; while (1) { x=(double)row_sum[j]*((double)rand()/RAND_MAX); partial_sum=MP[0][j]; i=1; while (partial_sum<x) {partial_sum+=MP[i][j]; i++;} if (j>=MAX_SEQUENCE_LENGTH) { i=21; }/* terminate when sequence has reached MAX_SEQUENCE_LENGTH */ if (i<21) { random_sequence[j]=AMINO_ACIDS[i-1];j++;generated_aa_freq[i-1]++; } else { /* i==21, i.e. protein sequence terminated */ k=0; generated_aa_freq[20]++; generated_pl_sum+=j; for(l=0;l<j;l++) { random_sequence_output[k]=random_sequence[l]; k++; if (!((k+1)%61)) { random_sequence_output[k]='\n'; k++; } } random_sequence_output[k]='\0'; if (!(k%61)) random_sequence_output[k-1]='\0'; /* remove extra newline for sequence length multiple of 60 */ fprintf(outp,">%srp%li\n%s\n",prefix_string,protein,random_sequence_output); break; } } } fclose(outp); // printf("done (wrote %li random protein sequences to %s)\n",sequences_to_generate,outfile); k=0;l=0; for(i=0;i<=20;i++) {k+=measured_aa_freq[i];l+=generated_aa_freq[i];} // printf("<average sequence length in %s> = %f\n<average sequence length in %s> = %f\n",infile,measured_pl_sum/(float)n_sequences,outfile,generated_pl_sum/(float)sequences_to_generate); return 0; }