ad 1 in colab: !pip install Bio from Bio.Align import substitution_matrices # Print available matrices print(substitution_matrices.load()) ad 2 Jukes-Cantor model, discrete time a) jukes_cantor_one_step(mu) #single parameter, all frequencies in the matrix the same def jukes_cantor_one_step(mu): # fill with off-diagonal values matrix = np.full((4, 4), mu / 4) # set the diagonal for i in range(4): matrix[i, i] = 1 - 3 * mu / 4 return matrix def jukes_cantor_substitution_matrix(mu, t): # fill with off-diagonal values matrix = np.full((4, 4), 1/4 - 1/4*np.exp(-4*mu*t)) # set the diagonal for i in range(4): matrix[i, i] = 1/4 + 3/4*np.exp(-4*mu*t) return matrix jukes_cantor_one_step(0.1) array([[0.925, 0.025, 0.025, 0.025], [0.025, 0.925, 0.025, 0.025], [0.025, 0.025, 0.925, 0.025], [0.025, 0.025, 0.025, 0.925]]) jukes_cantor_one_step(0.01) array([[0.9925, 0.0025, 0.0025, 0.0025], [0.0025, 0.9925, 0.0025, 0.0025], [0.0025, 0.0025, 0.9925, 0.0025], [0.0025, 0.0025, 0.0025, 0.9925]]) jukes_cantor_one_step(1e-06) #see lecture, real example array([[9.9999925e-01, 2.5000000e-07, 2.5000000e-07, 2.5000000e-07], [2.5000000e-07, 9.9999925e-01, 2.5000000e-07, 2.5000000e-07], [2.5000000e-07, 2.5000000e-07, 9.9999925e-01, 2.5000000e-07], [2.5000000e-07, 2.5000000e-07, 2.5000000e-07, 9.9999925e-01]]) b) Jukes-Cantor model, continuous time jukes_cantor_substitution_matrix(mu, t) jukes_cantor_substitution_matrix(0.1, 1) array([[0.75274003, 0.08241999, 0.08241999, 0.08241999], [0.08241999, 0.75274003, 0.08241999, 0.08241999], [0.08241999, 0.08241999, 0.75274003, 0.08241999], [0.08241999, 0.08241999, 0.08241999, 0.75274003]]) jukes_cantor_substitution_matrix(0.1, 10) array([[0.26373673, 0.24542109, 0.24542109, 0.24542109], [0.24542109, 0.26373673, 0.24542109, 0.24542109], [0.24542109, 0.24542109, 0.26373673, 0.24542109], [0.24542109, 0.24542109, 0.24542109, 0.26373673]]) jukes_cantor_substitution_matrix(0.01, 1) array([[0.97059208, 0.00980264, 0.00980264, 0.00980264], [0.00980264, 0.97059208, 0.00980264, 0.00980264], [0.00980264, 0.00980264, 0.97059208, 0.00980264], [0.00980264, 0.00980264, 0.00980264, 0.97059208]]) jukes_cantor_substitution_matrix(0.01, 10) array([[0.75274003, 0.08241999, 0.08241999, 0.08241999], [0.08241999, 0.75274003, 0.08241999, 0.08241999], [0.08241999, 0.08241999, 0.75274003, 0.08241999], [0.08241999, 0.08241999, 0.08241999, 0.75274003]]) jukes_cantor_substitution_matrix(0.01, 100) array([[0.26373673, 0.24542109, 0.24542109, 0.24542109], [0.24542109, 0.26373673, 0.24542109, 0.24542109], [0.24542109, 0.24542109, 0.26373673, 0.24542109], [0.24542109, 0.24542109, 0.24542109, 0.26373673]]) For detailed explanation why array values approach 0.25 read: https://en.wikipedia.org/wiki/Models_of_DNA_evolution#JC69_model_(Jukes_and_Cantor_1969) Additional, functions could look like: kimura_substitution_matrix(alpha, beta, t) #instead single parameter (mu), here we have alpha for transitions, beta for transversions def kimura_substitution_matrix(alpha, beta, t): """Kimura (K80) substitution matrix generator""" matrix = np.full((4, 4), 0.25 - 0.25 * np.exp(-2 * (alpha + beta) * t)) # Transversions diag = 0.25 + 0.5 * np.exp(-4 * beta * t) + 0.25 * np.exp(-2 * (alpha + beta) * t) # No substitution transition = 0.25 + 0.25 * np.exp(-2 * (alpha + beta) * t) - 0.5 * np.exp(-4 * beta * t) # Transitions for i in range(4): matrix[i, i] = diag # Set diagonal # Assign transitions (AâG and CâT) matrix[0, 2] = matrix[2, 0] = transition # A â G matrix[1, 3] = matrix[3, 1] = transition # C â T return matrix kimura_substitution_matrix(0.1, 0.05, 1) #alpha tends to be higher than beta array([[0.84456993, 0.06479544, 0.02583918, 0.06479544], [0.06479544, 0.84456993, 0.06479544, 0.02583918], [0.02583918, 0.06479544, 0.84456993, 0.06479544], [0.06479544, 0.02583918, 0.06479544, 0.84456993]]) see also: https://en.wikipedia.org/wiki/Models_of_DNA_evolution#K80_model_(Kimura_1980) felsenstein_substitution_matrix(mu, t, pi) # the main difference is that here we use the frequencies that are taken from the data, it means that the matiric does not heve to # be symetrical, but it is still ok pi = np.array([0.29, 0.18, 0.31, 0.22]) # Example unequal base frequencies for A, C, G, T felsenstein_substitution_matrix(0.01, 1, pi) array([[0.9721605 , 0.29 , 0.29 , 0.29 ], [0.18 , 0.96784734, 0.18 , 0.18 ], [0.31 , 0.31 , 0.97294471, 0.31 ], [0.22 , 0.22 , 0.22 , 0.96941576]]) see also: https://en.wikipedia.org/wiki/Models_of_DNA_evolution#F81_model_(Felsenstein_1981) thus our markov chain function using jukes_cantor_substitution_matrix can look like: markov_jc(seq, mu, t) where seq - sequence mu - frequency of mutations t - time Inside of jc() function given mu and t, we create jukes_cantor_substitution_matrix(0.01, 1), and then we go through the whole sequence and mutate accoding it ======================================================== Extra reading: TreeTime: Maximum-likelihood phylodynamic analysis https://doi.org/10.1093/ve/vex042 https://github.com/neherlab/treetime/blob/master/treetime/nuc_models.py Markov Chains in Python: Beginner Tutorial https://www.datacamp.com/tutorial/markov-chains-python-tutorial