#include #include #include #define _GNU_SOURCE int main(int argc, char* argv[]) { FILE * pFile, *pFile1; int i,j,header_lines,individuals,ig,start_col,end; long snps; int countL,samples,lineC,posG,countH,GTpos,DSpos,bo; char *p,*p1,*saveptr1,*saveptr2; char * line = NULL; size_t len = 0; ssize_t read; float eps,s; unsigned int hpp; unsigned short gh1,gh2; float bdI,dsI,gl1I,gl2I,gl3I; char sst[500],IsnpName[5000], Imajor[2000000],Iminor[2000000],Ifilter[2000],Iinfo[20000],Iformat[2000],Iqual[2000]; unsigned short **g1; unsigned short **g2; float **ds; unsigned short *chrom; unsigned int *pos; float *freq; float *Lval; unsigned int *Lind; unsigned short *change; char **major; char **minor; char **snpName; char **filter; char **qual; char **info; char **format; eps=0.6; sst[0]=0; strcat(sst,argv[2]); strcat(sst,argv[1]); strcat(sst,".vcf"); pFile = fopen (sst,"r"); if (argc>3) { snps = atoi(argv[3]); } else { lineC=0; while ((getline(&line, &len, pFile)) != -1) ++lineC; rewind(pFile); } countL=0; j=0; while ((read = getline(&line, &len, pFile)) != -1) { j++; if ( line[0]=='#' ) { if ( line[1]=='C' ) { if (argc<=3) { snps = lineC-j; } header_lines = j; printf ("SNPs: %d\n", snps); snps+=10; j=-1; samples=0; p = strtok_r (line,"\t",&saveptr1); sst[0]=0; strcat(sst,argv[1]); strcat(sst,"_samples.txt"); pFile1 = fopen (sst,"w"); while (p != NULL) { samples++; if (samples>9) { fprintf(pFile1,"%d %s\n", (samples-9),p); } p = strtok_r (NULL, "\t",&saveptr1); } fclose (pFile1); individuals = samples-9; printf ("Individuals: %d\n", individuals); } } else { //################################################################## if (j==0) { g1 = calloc(individuals, sizeof(unsigned short *)); g1[0] = calloc((long) individuals*snps, sizeof(unsigned short)); for(i=0; i < individuals; i++) { g1[i] = g1[0]+i*snps; } g2 = calloc(individuals, sizeof(unsigned short *)); g2[0] = calloc((long) individuals*snps, sizeof(unsigned short)); for(i=0; i < individuals; i++) { g2[i] = g2[0]+i*snps; } ds = calloc(individuals, sizeof(float *)); ds[0] = calloc((long) individuals*snps, sizeof(float)); for(i=0; i < individuals; i++) { ds[i] = ds[0]+i*snps; } chrom = calloc((long) snps , sizeof(unsigned short)); pos = calloc((long) snps , sizeof(unsigned int)); freq = calloc((long) snps , sizeof(float)); Lval = calloc((long) snps , sizeof(float)); Lind = calloc((long) snps , sizeof(unsigned int)); change = calloc((long) snps , sizeof(unsigned short)); // char *Imajor = calloc(2000000, sizeof(char)); // char *Iminor = calloc(2000000, sizeof(char)); major = calloc(snps, sizeof(char *)); major[0] = calloc((long) 200*snps, sizeof(char)); for(i=0; i < snps; i++) { major[i] = major[0]+i*200; } minor = calloc(snps, sizeof(char *)); minor[0] = calloc((long) 200*snps, sizeof(char)); for(i=0; i < snps; i++) { minor[i] = minor[0]+i*200; } snpName = calloc(snps, sizeof(char *)); snpName[0] = calloc((long) 50*snps, sizeof(char)); for(i=0; i < snps; i++) { snpName[i] = snpName[0]+i*50; } filter = calloc(snps, sizeof(char *)); filter[0] = calloc((long) 20*snps, sizeof(char)); for(i=0; i < snps; i++) { filter[i] = filter[0]+i*20; } qual = calloc(snps, sizeof(char *)); qual[0] = calloc((long) 20*snps, sizeof(char)); for(i=0; i < snps; i++) { qual[i] = qual[0]+i*20; } info = calloc(snps, sizeof(char *)); info[0] = calloc((long) 200*snps, sizeof(char)); for(i=0; i < snps; i++) { info[i] = info[0]+i*200; } format = calloc(snps, sizeof(char *)); format[0] = calloc((long) 20*snps, sizeof(char)); for(i=0; i < snps; i++) { format[i] = format[0]+i*20; } } //################################################################## if (j%1000==0) { printf("Read SNP: %d\r",j); } i=0; samples=0; p = strtok_r (line,"\t",&saveptr1); while (p != NULL) { switch(samples) { case 0: sscanf(p,"%hu", &gh1); chrom[j] = gh1; break; case 1: sscanf(p,"%u", &hpp); pos[j] = hpp; break; case 2: sscanf(p,"%s", IsnpName); strncat(snpName[j],IsnpName,49); break; case 3: sscanf(p,"%s", Imajor); strncat(major[j],Imajor,190); break; case 4: sscanf(p,"%s", Iminor); strncat(minor[j],Iminor,190); break; case 5: sscanf(p,"%s", Iqual); strncat(qual[j],Iqual,19); break; case 6: sscanf(p,"%s", Ifilter); strncat(filter[j],Ifilter,19); break; case 7: sscanf(p,"%s", Iinfo); strncat(info[j],Iinfo,190); break; case 8: sscanf(p,"%s",Iformat); strncat(format[j],Iformat,19); p1 = strtok_r (p,":",&saveptr2); GTpos=0; DSpos=0; posG=0; while (p1 != NULL) { posG++; if (strcmp (p1,"GT") == 0) { GTpos=posG; } if (strcmp (p1,"DS") == 0) { DSpos=posG; } p1 = strtok_r (NULL, ":",&saveptr2); } if (GTpos==0) { printf("Error in SNP %d and Line %d: no GT found!\n",(countL- header_lines), countL); return(1); } break; default: p1 = strtok_r (p,":",&saveptr2); gh1=0; gh2=0; dsI=0.0; posG=0; while (p1 != NULL) { posG++; if (posG==GTpos) { bo=sscanf(p1,"%hu|%hu", &gh1,&gh2); if (bo<2) { bo=sscanf(p1,"%hu/%hu", &gh1,&gh2); } } if (posG==DSpos) { sscanf(p1,"%f", &dsI); } p1 = strtok_r (NULL, ":",&saveptr2); } g1[i][j] = gh1; g2[i][j] = gh2; ds[i][j] = dsI; i++; break; } samples++; p = strtok_r (NULL, "\t",&saveptr1); } } countL++; } fclose (pFile); snps= j+1; printf ("\n Reconfirmed SNPs: %d\n", snps); // check for minor allele and change for(j = 0; j < snps; j ++) { for(ig=0,i = 0; i < individuals; i ++) { if (g1[i][j]>0) { ig++; } if (g2[i][j]>0) { ig++; } } freq[j] = (float) (1.0*ig)/(2.0*individuals); change[j]=0; if (ig>individuals) { change[j]=1; for(i = 0; i < individuals; i ++) { g1[i][j]= 1 - g1[i][j]; g2[i][j]= 1 - g2[i][j]; ds[i][j] = 2.0 - ds[i][j]; } } } //check for high probable snps /* for(j = 0; j < snps; j ++) */ /* { */ /* for(i = 0; i < individuals; i ++) { */ /* if (p1[i][j]<0.8) { */ /* p1[i][j]=0.0; */ /* } */ /* if (p2[i][j]<0.8) { */ /* p2[i][j]=0.0; */ /* } */ /* } */ /* } */ sst[0]=0; strcat(sst,argv[1]); strcat(sst,"_annot.txt"); pFile = fopen (sst,"w"); fprintf(pFile,"Individuals: %d\n",individuals); fprintf(pFile,"SNPs : %d\n",snps); for(j = 0; j < snps; j ++) { fprintf(pFile,"%d %d %s %s %s %s %s %s %s %.8f %d\n",chrom[j],pos[j], snpName[j], major[j], minor[j],qual[j],filter[j],info[j],format[j],freq[j],change[j]); } fclose (pFile); sst[0]=0; strcat(sst,argv[1]); strcat(sst,"_mat.txt"); pFile = fopen (sst,"w"); fprintf(pFile,"%d\n",2*individuals); fprintf(pFile,"%d\n",snps); for(i = 0; i < individuals; i ++) { printf("Write Individual Haplotype: %d\n",(i+1)); ig=0; for(j = 0; j < snps; j ++) { if (g1[i][j]>0){ Lind[ig] = j; Lval[ig] = 1.0; ig++; } } fprintf(pFile,"%d\n",ig); for(j = 0; j < ig; j ++) fprintf(pFile,"%d ",Lind[j]); fprintf(pFile,"\n"); for(j = 0; j < ig; j ++) fprintf(pFile,"%.1f ",Lval[j]); fprintf(pFile,"\n"); ig=0; for(j = 0; j < snps; j ++) { if (g2[i][j]>0){ Lind[ig] = j; Lval[ig] = 1.0; ig++; } } fprintf(pFile,"%d\n",ig); for(j = 0; j < ig; j ++) fprintf(pFile,"%d ",Lind[j]); fprintf(pFile,"\n"); for(j = 0; j < ig; j ++) fprintf(pFile,"%.1f ",Lval[j]); fprintf(pFile,"\n"); } fclose (pFile); sst[0]=0; strcat(sst,argv[1]); strcat(sst,"_matG.txt"); pFile = fopen (sst,"w"); fprintf(pFile,"%d\n",individuals); fprintf(pFile,"%d\n",snps); for(i = 0; i < individuals; i ++) { printf("Write Individual Genotype: %d\n",(i+1)); ig=0; for(j = 0; j < snps; j ++) { s = g1[i][j]+g2[i][j]; if (s>eps){ Lind[ig] = j; Lval[ig] = s; ig++; } } fprintf(pFile,"%d\n",ig); for(j = 0; j < ig; j ++) fprintf(pFile,"%d ",Lind[j]); fprintf(pFile,"\n"); for(j = 0; j < ig; j ++) fprintf(pFile,"%.1f ",Lval[j]); fprintf(pFile,"\n"); } fclose (pFile); sst[0]=0; strcat(sst,argv[1]); strcat(sst,"_matD.txt"); pFile = fopen (sst,"w"); fprintf(pFile,"%d\n",individuals); fprintf(pFile,"%d\n",snps); for(i = 0; i < individuals; i ++) { printf("Write Individual Dosage: %d\n",(i+1)); ig=0; for(j = 0; j < snps; j ++) { s = ds[i][j]; if (s>eps){ Lind[ig] = j; Lval[ig] = s; ig++; } } fprintf(pFile,"%d\n",ig); for(j = 0; j < ig; j ++) fprintf(pFile,"%d ",Lind[j]); fprintf(pFile,"\n"); for(j = 0; j < ig; j ++) fprintf(pFile,"%.3f ",Lval[j]); fprintf(pFile,"\n"); } fclose (pFile); return 0; }