# Encoder.py - DNA_Encoding_Decoding_System (the-cai-lab), main branch

import random
import pandas as pd
import os.path
import argparse

def text_to_bin(Eng_file):
    """
    Read a text file and return its characters as one concatenated binary string.

    Every character of the file (line breaks included) is converted to its
    code point with ``ord`` and then to Python's binary literal form with
    ``bin``, e.g. ``'A'`` -> ``'0b1000001'``. The pieces are joined without any
    separator, so the ``'0b'`` prefixes are the only delimiters between
    characters. Nothing is written to disk; the result is returned.

    Args:
        Eng_file (str): Path of the text file to convert.

    Returns:
        str: The concatenated ``'0b...'`` strings, or ``''`` for an empty file.

    Assumptions:
    -The Eng_file corresponds to an existing, readable text file.
    -Data stored in the Eng_file is English characters or Western Arabic Numerals.

    """

    # English character >> code point >> binary literal, joined in file order.
    # A single join avoids rebuilding the growing string for every character.
    with open(Eng_file, 'r') as f:
        binary_string = ''.join(
            bin(ord(character)) for line in f for character in line
        )

    print('The content of the file \033[1;33m{}\033[0m has been converted to binary and returned as a string.'.format(Eng_file))
    return binary_string


def rand_rep(string_length, max_rep, n_draws=100000):
    """
    Draw random nucleotide strings and return the distinct ones in first-seen order.

    Each draw samples 'string_length' bases without replacement from a pool
    that holds every base of ['A', 'C', 'T', 'G'] 'max_rep' times, so no base
    occurs more than 'max_rep' times in a string. The global random generator
    is reseeded with 35 on every call, which makes the result reproducible
    (and also resets the random state for the caller).

    The printed note reports how many distinct strings were found in the
    draws. That equals the number of possible strings only when enough draws
    are made to hit every one of them.

    Args:
        string_length (int): The desired length of the generated nucleotide strings.
        max_rep (int): The maximum number of times a nucleotide can be repeated in a string.
        n_draws (int, optional): How many random strings to draw. Defaults to 100000.

    Returns:
        list: The unique random nucleotide strings, ordered by first appearance.

    Raises:
        ValueError: If 'string_length' exceeds 4 * 'max_rep', i.e. the pool is
            too small to build a single string.

    """
    bases = ["A", "C", "T", "G"]

    # random.sample would raise ValueError here as well; fail early with a
    # message that names the offending parameters.
    if string_length > len(bases) * max_rep:
        raise ValueError(
            'Cannot build strings of length {} when each of the {} bases may be used at most {} time(s).'.format(
                string_length, len(bases), max_rep
            )
        )

    random.seed(35)  # Setting the random seed for reproducibility.

    # dict.fromkeys drops duplicates in O(1) per draw while keeping the order
    # in which strings were first generated.
    unique_strings = dict.fromkeys(
        "".join(random.sample(bases, k=string_length, counts=[max_rep] * len(bases)))
        for _ in range(n_draws)
    )
    strings_list = list(unique_strings)

    print('Note: The maximum number of nucleotide strings that can be generated with the length of {} and maximum nuc/len of {} is: {}'.format(string_length, max_rep, len(strings_list)))

    return strings_list


def generate_ascii_nuc_table(string_length, max_rep):
    """
    Generates an updated ASCII table with nucleotide codes and saves it as a CSV file.

    Args:
        string_length (int): The desired length of the generated nucleotide strings.
        max_rep (int): The maximum number of times a nucleotide can be repeated in a string.

    Raises:
        ValueError: If fewer than 128 unique nucleotide strings can be generated
            with the given parameters (one distinct code is needed per ASCII row).

    The function uses a pre-existing ASCII table stored in a CSV file named 'ASCII.csv'
    in the working directory. It reads the columns 'Decimal', 'Binary', 'Octal',
    'Hexadecimal', 'Symbol' and 'Description' and adds two new columns:
    '0bBinary' (the 'Binary' value as text with a '0b' prefix) and 'Nuc_code'
    (128 distinct strings picked at random from the output of 'rand_rep').
    The table is rearranged to: 'Decimal', 'Binary', 'Octal', 'Hexadecimal',
    '0bBinary', 'Nuc_code', 'Symbol', 'Description' and saved as
    'Nuc_ascii_SL{string_length}_MR{max_rep}.csv'.
    A confirmation message is printed, indicating the name of the saved CSV file.
    """
    n_codes = 128  # One nucleotide code per row of the 128-character ASCII table.

    # NOTE: rand_rep() below reseeds the global generator itself, so the picks
    # made in this function follow from the state rand_rep leaves behind. The
    # call is kept so the statement sequence, and therefore every generated
    # table, stays exactly as before.
    random.seed(5)

    # Reading the pre-existing ASCII table from 'ASCII.csv'.
    ascii_table = pd.read_csv("ASCII.csv", sep=",")

    # Prepending '0b' to the binary values.
    binary_with_pre = '0b' + ascii_table["Binary"].astype(str)

    # Generating random nucleotide strings using the 'rand_rep' function.
    strings = rand_rep(string_length, max_rep)

    # Without this guard the rejection loop below could never finish.
    if len(strings) < n_codes:
        raise ValueError(
            'Only {} unique nucleotide strings are available for string length {} and maximum repetitions {}; at least {} are required.'.format(
                len(strings), string_length, max_rep, n_codes
            )
        )

    # Pick distinct strings one random.choice at a time; the set only speeds up
    # the duplicate check, the list keeps the pick order.
    nuc_strings = []
    already_picked = set()
    while len(nuc_strings) < n_codes:
        random_element = random.choice(strings)
        if random_element not in already_picked:
            already_picked.add(random_element)
            nuc_strings.append(random_element)

    ascii_table['0bBinary'] = binary_with_pre
    ascii_table['Nuc_code'] = nuc_strings

    ascii_table = ascii_table[['Decimal', 'Binary', 'Octal', 'Hexadecimal', '0bBinary', 'Nuc_code', 'Symbol', 'Description']]

    new_table_name = 'Nuc_ascii_SL{}_MR{}.csv'.format(string_length, max_rep)

    ascii_table.to_csv(new_table_name, sep=',', index=False)

    print('The updated ASCII table with nucleotide codes has been created and saved as: {}'.format(new_table_name))


def encode(input_filename, output_filename='Encoded_text', string_length=4, max_rep=2):
    """
    Encodes the text of an input file as nucleotide codes and writes the result to an output file.

    Args:
        input_filename (str): The name of the text file to encode.
        output_filename (str): Base name of the output file; '_SL{string_length}_MR{max_rep}.txt' is appended.
        string_length (int): The desired length of each nucleotide code. Values below 4 are reset to 4.
        max_rep (int): The maximum number of times a nucleotide can be repeated in a string.

    Raises:
        IndexError: If a character of the input has no row in the nucleotide
            ASCII table (the exception type of the original table lookup is kept).

    The function loads the table 'Nuc_ascii_SL{string_length}_MR{max_rep}.csv' from the
    working directory, generating it first when it does not exist yet.
    The input text is converted with 'text_to_bin' into concatenated '0b...' strings,
    which are split back into one binary string per character.
    Each binary string is replaced by the 'Nuc_code' of the table row with the same '0bBinary'
    value, and the concatenated codes are written to the output file, replacing old content.
    """

    if string_length < 4:
        print('The string length cannot be less than 4.')
        print('Resetting the string length to 4..')
        string_length = 4

    print('Encoding parameters:')
    print('Nucleotide strings length: \033[1;31m{}\033[0m'.format(string_length))
    print('Maximum repetitions of similar bases per string: \033[1;31m{}\033[0m'.format(max_rep))
    print('--------------------------------------------------')
    table_name = 'Nuc_ascii_SL{}_MR{}.csv'.format(string_length, max_rep)
    path = './{}'.format(table_name)

    # Checking if the updated ASCII table with the input parameters already exists.
    if os.path.isfile(path):
        print('The updated ASCII table with the input parameters already exists in the current directory.')
        print('Skipping generating nucleotide ASCII table..')
    else:
        generate_ascii_nuc_table(string_length, max_rep)

    # Reading the ASCII table with nucleotide codes.
    ascii_table = pd.read_csv(table_name, sep=",")

    binary_string = text_to_bin(input_filename)

    # The string consists only of '0', '1' and the 'b' of each '0b' prefix, so
    # splitting on '0b' yields exactly one chunk per character. The first chunk
    # is the empty text before the first prefix and is dropped. This also
    # handles a single-character input, which has no following '0b' to stop at.
    bin_list = ['0b' + digits for digits in binary_string.split('0b')[1:]]

    # Binary string -> nucleotide code. setdefault keeps the first row if a
    # binary value were listed twice, like a first-match table lookup.
    code_by_binary = {}
    for binary, code in zip(ascii_table['0bBinary'], ascii_table['Nuc_code']):
        code_by_binary.setdefault(binary, code)

    try:
        nucleotide_codes = [code_by_binary[binary] for binary in bin_list]
    except KeyError as err:
        raise IndexError(
            'No nucleotide code found in {} for binary value {!r}.'.format(table_name, err.args[0])
        ) from err

    output_file = '{}_SL{}_MR{}.txt'.format(output_filename, string_length, max_rep)
    # One write replaces any previous content and avoids reopening the file per character.
    with open(output_file, 'w') as f:
        f.write(''.join(nucleotide_codes))

    print('The length of the encoded text is \033[1;34m{}\033[0m DNA bases.'.format(len(bin_list)*string_length))
    print('The text has been successfully encoded and saved in file: \033[1;32m{}\033[0m'.format(output_file))


# Command-line interface. Building the parser has no side effects, so it stays
# at module level; reading the command line happens only in main().
parser = argparse.ArgumentParser(description="Converts binary strings in an input file to nucleotide codes and writes the result to an output file.")

parser.add_argument('-a', '--english_file_name', required=True, type=str, metavar='', help='The name of the file that contains the English text to be encoded.')
parser.add_argument('-b', '--output_file_name', required=False, type=str, metavar='', default='Encoded_text', help="The name of the output file. Do not enter the file name with .txt extension.")
parser.add_argument('-l', '--string_length', required=False, type=int, metavar='', default=4, help='The length of the nucleotide strings that map to ASCII binary values. The default value is 4.')
parser.add_argument('-r', '--maximum_repetitions', required=False, type=int, metavar='', default=2, help='The maximum number of times a nucleotide can be repeated in a string. The default value is 2')


def main(argv=None):
    """
    Parse the command line and run the encoder.

    Args:
        argv (list, optional): Argument strings to parse. Defaults to None,
            in which case argparse reads them from sys.argv.
    """
    args = parser.parse_args(argv)
    encode(args.english_file_name, args.output_file_name, args.string_length, args.maximum_repetitions)


# Parsing inside the guard keeps a plain import of this module from consuming
# (and failing on) the importing program's command-line arguments.
if __name__ == "__main__":
    main()