#include #include #include #define _GNU_SOURCE int main(int argc, char* argv[]) { FILE * pFile; int i,j,header_lines,individuals,snps,ig,start_col,end; snps = 3530000; individuals = 1092; header_lines = 27; start_col = 9; unsigned short **g1 = calloc(individuals, sizeof(unsigned short *)); g1[0] = calloc((long) individuals*snps, sizeof(unsigned short)); for(i=0; i < individuals; i++) { g1[i] = g1[0]+i*snps; } unsigned short **g2 = calloc(individuals, sizeof(unsigned short *)); g2[0] = calloc((long) individuals*snps, sizeof(unsigned short)); for(i=0; i < individuals; i++) { g2[i] = g2[0]+i*snps; } float **ds = calloc(individuals, sizeof(float *)); ds[0] = calloc((long) individuals*snps, sizeof(float)); for(i=0; i < individuals; i++) { ds[i] = ds[0]+i*snps; } float **bd = calloc(individuals, sizeof(float *)); bd[0] = calloc((long) individuals*snps, sizeof(float)); for(i=0; i < individuals; i++) { bd[i] = bd[0]+i*snps; } float **gl1 = calloc(individuals, sizeof(float *)); gl1[0] = calloc((long) individuals*snps, sizeof(float)); for(i=0; i < individuals; i++) { gl1[i] = gl1[0]+i*snps; } float **gl2 = calloc(individuals, sizeof(float *)); gl2[0] = calloc((long) individuals*snps, sizeof(float)); for(i=0; i < individuals; i++) { gl2[i] = gl2[0]+i*snps; } float **gl3 = calloc(individuals, sizeof(float *)); gl3[0] = calloc((long) individuals*snps, sizeof(float)); for(i=0; i < individuals; i++) { gl3[i] = gl3[0]+i*snps; } unsigned short *chrom = calloc((long) snps , sizeof(unsigned short)); unsigned int *pos = calloc(snps , sizeof(unsigned int)); unsigned int hpp; float *freq = calloc((long) snps , sizeof(float)); float *Lval = calloc((long) snps , sizeof(float)); unsigned int *Lind = calloc((long) snps , sizeof(unsigned int)); unsigned short gh1,gh2; float bdI,dsI,gl1I,gl2I,gl3I; unsigned short *change = calloc(snps , sizeof(unsigned short)); char sst[500],IsnpName[5000], Imajor[2000000],Iminor[2000000],Ifilter[2000],Iinfo[20000],Iformat[2000],Iqual[2000]; // char *Imajor = calloc(2000000, sizeof(char)); // char *Iminor = calloc(2000000, sizeof(char)); char **major = calloc(snps, sizeof(char *)); major[0] = calloc((long) 200*snps, sizeof(char)); for(i=0; i < snps; i++) { major[i] = major[0]+i*200; } char **minor = calloc(snps, sizeof(char *)); minor[0] = calloc((long) 200*snps, sizeof(char)); for(i=0; i < snps; i++) { minor[i] = minor[0]+i*200; } char **snpName = calloc(snps, sizeof(char *)); snpName[0] = calloc((long) 50*snps, sizeof(char)); for(i=0; i < snps; i++) { snpName[i] = snpName[0]+i*50; } char **filter = calloc(snps, sizeof(char *)); filter[0] = calloc((long) 20*snps, sizeof(char)); for(i=0; i < snps; i++) { filter[i] = filter[0]+i*20; } char **qual = calloc(snps, sizeof(char *)); qual[0] = calloc((long) 20*snps, sizeof(char)); for(i=0; i < snps; i++) { qual[i] = qual[0]+i*20; } char **info = calloc(snps, sizeof(char *)); info[0] = calloc((long) 200*snps, sizeof(char)); for(i=0; i < snps; i++) { info[i] = info[0]+i*200; } char **format = calloc(snps, sizeof(char *)); format[0] = calloc((long) 20*snps, sizeof(char)); for(i=0; i < snps; i++) { format[i] = format[0]+i*20; } float eps,s; eps=0.6; sst[0]=0; strcat(sst,argv[1]); strcat(sst,".vcf"); pFile = fopen (sst,"r"); for(j = 0; j < header_lines; j ++) { fscanf(pFile,"%[^\n]\n",sst); // printf("%s\n",sst); } for(i = 0; i < start_col; i ++) { fscanf(pFile," %s",sst); // printf("%s\n",sst); } for(i = 0; i < individuals; i ++) { fscanf(pFile," %s",sst); // printf("%s\n",sst); fscanf(pFile,"\n"); } j = 0; while (1) { if (j%1000==0) { printf("Read SNP: %d\n",j); } end=fscanf(pFile,"%hu %u %s %s %s %s %s %s %s",&gh1,&hpp, IsnpName, Imajor, Iminor,Iqual,Ifilter,Iinfo,Iformat); if (end<9) break; //printf("END: %d\n",end); chrom[j] = gh1; pos[j] = hpp; strncat(snpName[j],IsnpName,49); strncat(major[j],Imajor,190); strncat(minor[j],Iminor,190); strncat(qual[j],Iqual,19); strncat(filter[j],Ifilter,19); strncat(info[j],Iinfo,190); strncat(format[j],Iformat,19); // printf("%d %d#######\n",chrom[j],pos[j]); for(i = 0; i < individuals; i ++) { fscanf(pFile," %hu|%hu:%f:%f,%f,%f:%f", &gh1,&gh2,&dsI,&gl1I,&gl2I,&gl3I,&bdI); //printf("%d|%d:%f:%f,%f,%f:%f\n", gh1,gh2,dsI,gl1I,gl2I,gl3I,bdI); g1[i][j] = gh1; g2[i][j] = gh2; ds[i][j] = dsI; gl1[i][j] = gl1I; gl2[i][j] = gl2I; gl3[i][j] = gl3I; bd[i][j] = bdI; fscanf(pFile,"\n"); } j++; } snps= j; fclose (pFile); // check for minor allele and change for(j = 0; j < snps; j ++) { for(ig=0,i = 0; i < individuals; i ++) { if (g1[i][j]>0) { ig++; } if (g2[i][j]>0) { ig++; } } freq[j] = (float) (1.0*ig)/(2.0*individuals); change[j]=0; if (ig>individuals) { change[j]=1; for(i = 0; i < individuals; i ++) { g1[i][j]= 1 - g1[i][j]; g2[i][j]= 1 - g2[i][j]; ds[i][j] = 2.0 - ds[i][j]; s = gl1[i][j]; gl1[i][j] = gl3[i][j]; gl3[i][j] = s; bd[i][j] = 2.0 - bd[i][j]; } } } //check for high probable snps /* for(j = 0; j < snps; j ++) */ /* { */ /* for(i = 0; i < individuals; i ++) { */ /* if (p1[i][j]<0.8) { */ /* p1[i][j]=0.0; */ /* } */ /* if (p2[i][j]<0.8) { */ /* p2[i][j]=0.0; */ /* } */ /* } */ /* } */ sst[0]=0; strcat(sst,argv[1]); strcat(sst,"_annot.txt"); pFile = fopen (sst,"w"); fprintf(pFile,"Individuals: %d\n",individuals); fprintf(pFile,"SNPs : %d\n",snps); for(j = 0; j < snps; j ++) { fprintf(pFile,"%d %d %s %s %s %s %s %s %s %.8f %d\n",chrom[j],pos[j], snpName[j], major[j], minor[j],qual[j],filter[j],info[j],format[j],freq[j],change[j]); } fclose (pFile); sst[0]=0; strcat(sst,argv[1]); strcat(sst,"_mat.txt"); pFile = fopen (sst,"w"); fprintf(pFile,"%d\n",2*individuals); fprintf(pFile,"%d\n",snps); for(i = 0; i < individuals; i ++) { printf("Write Individual Haplotype: %d\n",(i+1)); ig=0; for(j = 0; j < snps; j ++) { if (g1[i][j]>0){ Lind[ig] = j; Lval[ig] = 1.0; ig++; } } fprintf(pFile,"%d\n",ig); for(j = 0; j < ig; j ++) fprintf(pFile,"%d ",Lind[j]); fprintf(pFile,"\n"); for(j = 0; j < ig; j ++) fprintf(pFile,"%.1f ",Lval[j]); fprintf(pFile,"\n"); ig=0; for(j = 0; j < snps; j ++) { if (g2[i][j]>0){ Lind[ig] = j; Lval[ig] = 1.0; ig++; } } fprintf(pFile,"%d\n",ig); for(j = 0; j < ig; j ++) fprintf(pFile,"%d ",Lind[j]); fprintf(pFile,"\n"); for(j = 0; j < ig; j ++) fprintf(pFile,"%.1f ",Lval[j]); fprintf(pFile,"\n"); } fclose (pFile); sst[0]=0; strcat(sst,argv[1]); strcat(sst,"_matG.txt"); pFile = fopen (sst,"w"); fprintf(pFile,"%d\n",individuals); fprintf(pFile,"%d\n",snps); for(i = 0; i < individuals; i ++) { printf("Write Individual Genotype: %d\n",(i+1)); ig=0; for(j = 0; j < snps; j ++) { s = g1[i][j]+g2[i][j]; if (s>eps){ Lind[ig] = j; Lval[ig] = s; ig++; } } fprintf(pFile,"%d\n",ig); for(j = 0; j < ig; j ++) fprintf(pFile,"%d ",Lind[j]); fprintf(pFile,"\n"); for(j = 0; j < ig; j ++) fprintf(pFile,"%.3f ",Lval[j]); fprintf(pFile,"\n"); } fclose (pFile); sst[0]=0; strcat(sst,argv[1]); strcat(sst,"_matD.txt"); pFile = fopen (sst,"w"); fprintf(pFile,"%d\n",individuals); fprintf(pFile,"%d\n",snps); for(i = 0; i < individuals; i ++) { printf("Write Individual Dosage: %d\n",(i+1)); ig=0; for(j = 0; j < snps; j ++) { s = ds[i][j]; if (s>eps){ Lind[ig] = j; Lval[ig] = s; ig++; } } fprintf(pFile,"%d\n",ig); for(j = 0; j < ig; j ++) fprintf(pFile,"%d ",Lind[j]); fprintf(pFile,"\n"); for(j = 0; j < ig; j ++) fprintf(pFile,"%.3f ",Lval[j]); fprintf(pFile,"\n"); } fclose (pFile); return(0); }