#include #include #include #include char* extractUID(char *header) { char *uid = strdup(header + 1); // Skip '>' char *end = strchr(uid, ' '); if (end != NULL) { *end = '\0'; // Terminate at the first space } return uid; } void printStats(char *uid, long long length, long long A_count, long long T_count, long long G_count, long long C_count, FILE *output) { long long total_count = A_count + T_count + G_count + C_count; fprintf(output, "Chromosome UID: %s\n", uid); fprintf(output, "Length: %lld\n", length); fprintf(output, "Nucleotide frequencies:\n"); fprintf(output, "A: % 10lld % 5.2f%%\n", A_count, ((double)A_count / total_count) * 100); fprintf(output, "T: % 10lld % 5.2f%%\n", T_count, ((double)T_count / total_count) * 100); fprintf(output, "G: % 10lld % 5.2f%%\n", G_count, ((double)G_count / total_count) * 100); fprintf(output, "C: % 10lld % 5.2f%%\n", C_count, ((double)C_count / total_count) * 100); fprintf(output, "GC content: %.2f%%\n\n", ((double)(G_count + C_count) / total_count) * 100); } void printFileContent(FILE *file) { int c; rewind(file); while ((c = fgetc(file)) != EOF) { putchar(c); } printf("\n"); } void processFasta(FILE *fasta, FILE *output) { char *line = NULL; size_t len = 0; ssize_t read; char *currentUID = NULL; int in_sequence = 0; long long A_count = 0, T_count = 0, G_count = 0, C_count = 0, length = 0; long long A_global_count = 0, T_global_count = 0, G_global_count = 0, C_global_count = 0, length_global = 0; while ((read = getline(&line, &len, fasta)) != -1) { if (line[0] == '>') { if (in_sequence) { length_global += length; A_global_count += A_count; T_global_count += T_count; G_global_count += G_count; C_global_count += C_count; printf("Processing chromosome: %s\n", currentUID); printStats(currentUID, length, A_count, T_count, G_count, C_count, output); free(currentUID); A_count = T_count = G_count = C_count = length = 0; } currentUID = extractUID(line); in_sequence = 1; } else { for (int i = 0; line[i] != '\0' && line[i] != '\n'; i++) { char c = toupper(line[i]); if (isalpha(c)) { length++; if (c == 'A') { A_count++; } else if (c == 'T') { T_count++; } else if (c == 'G') { G_count++; } else if (c == 'C') { C_count++; } } } } } if (in_sequence) { length_global += length; A_global_count += A_count; T_global_count += T_count; G_global_count += G_count; C_global_count += C_count; printf("Processing chromosome: %s\n", currentUID); printf("A=%lld, G=%lld, C=%lld, T=%lld\n\n", A_count, G_count, C_count, T_count); printStats(currentUID, length, A_count, T_count, G_count, C_count, output); free(currentUID); } fprintf(output, "=============================================\nSummary statistics for the whole human genome.\n"); printStats("WholeGenome", length_global, A_global_count, T_global_count, G_global_count, C_global_count, output); free(line); } int main() { FILE *fasta = fopen("/home/lukaskoz/tmp/lab7_wdi/chm13v2.0.fa", "r"); //FILE *fasta = fopen("/home/lukaskoz/tmp/lab7_wdi/chm13v2.0.fa_5M", "r"); //%M contains 2 chromosomes only FILE *output = fopen("/home/lukaskoz/tmp/lab7_wdi/genome_stats_full.txt", "w"); if (fasta == NULL || output == NULL) { perror("Error opening files"); return 1; } processFasta(fasta, output); fclose(fasta); fclose(output); // Display output file content FILE *outputFile = fopen("/home/lukaskoz/tmp/lab7_wdi/genome_stats_full.txt", "r"); if (outputFile == NULL) { perror("Error opening output file"); return 1; } printf("\n\n=============================================\n\nContent of output file:\n\n"); printFileContent(outputFile); fclose(outputFile); return 0; }