import numpy as np
import pandas as pd
from numba import njit
from scipy.linalg import expm
from scipy.special import comb
from itrails.cutpoints import cutpoints_AB, cutpoints_ABC
[docs]
def rate_mat_JC69(mu):
"""
This function returns the rate matrix for the JC69 model.
Parameters
----------
mu : numeric
Mutation rate
"""
return np.full((4, 4), mu / 4) - np.diag([mu, mu, mu, mu])
[docs]
def p_b_given_a(t, Q):
"""
This function calculates the probability of observing the
nucleotide b given a, t and Q. a is the starting nucleotide,
while b is the end nucleotide. t is the total time of the interval.
P(b = bb | a == aa, Q, t)
Parameters
----------
t : numeric
Total time of the interval (from a/b to c)
Q : numpy array
A 4x4 rate matrix for any substitution model
"""
nt = ["A", "G", "C", "T"]
mat = np.zeros((4, 4))
for i in range(len(t)):
mat = mat + t[i] * Q[i]
arr = np.empty((4**2, 3))
acc = 0
mat = expm(mat)
for aa in range(4):
for bb in range(4):
arr[acc] = [aa, bb, mat[aa, bb]]
acc += 1
df = pd.DataFrame(arr, columns=["a", "b", "prob"])
df["a"] = [nt[int(i)] for i in df["a"]]
df["b"] = [nt[int(i)] for i in df["b"]]
return df
[docs]
@njit
def JC69_analytical_integral(aa, bb, cc, dd, t, mu, k):
"""
This function calculates the probability of observing the
nucleotides bb, cc and dd given aa, t and mu. aa and bb are the starting
nucleotides, while cc is the end nucleotide. dd is the nucleotide at
the time of coalescent. t is the total time of the interval. The
returned value corresponds to integrating the coalescent to d over
the entirety of t.
P(b = bb, c == cc, d == dd | a == aa, mu, t)
c ^
| |
__d__ | t
| | |
a b |
Parameters
----------
aa, bb, cc, dd : integer or string
nucleotide at a, b, c and d respectively
t : numeric
Total time of the interval (from a/b/c to d)
mu : numeric
The mutation rate for the JC69 model
k : numeric
The coalescent rate
"""
alpha = 3 / 4 if aa == dd else -1 / 4
beta = 3 / 4 if dd == bb else -1 / 4
gamma = 3 / 4 if dd == cc else -1 / 4
res = (
k
* (
((-1 + np.exp(k * t)) * (np.exp(mu * t) + 16 * (alpha + beta) * gamma))
/ (np.exp((k + mu) * t) * k)
+ 4
* (
gamma / (np.exp(k * t) * (-k + mu))
+ (alpha + beta) / (k + mu)
- (alpha + beta) / (np.exp((k + mu) * t) * (k + mu))
+ (4 * alpha * beta) / (k + 2 * mu)
+ (gamma * ((k - mu) ** (-1) + (16 * alpha * beta) / (k + mu)))
/ np.exp(mu * t)
+ (4 * alpha * beta * ((-4 * gamma) / (k + mu) - (k + 2 * mu) ** (-1)))
/ np.exp((k + 2 * mu) * t)
)
)
) / (64 * (1 - np.exp(-(k * t))))
return res
[docs]
def p_b_c_given_a_JC69_analytical(t, mu, k):
"""
This function returns a data frame with the values of
P(b, c | a) for all combinations of nucleotides.
Parameters
----------
t : numeric
Total time of the interval (from a/b/c to d)
mu : numeric
The mutation rate for the JC69 model
k : numeric
The coalescent rate
"""
nt = ["A", "G", "C", "T"]
arr = np.empty((4**3, 4))
acc = 0
for aa in range(4):
for bb in range(4):
for cc in range(4):
cumsum = 0
for dd in range(4):
cumsum += JC69_analytical_integral(aa, bb, cc, dd, t, mu, k)
arr[acc] = [aa, bb, cc, cumsum]
acc += 1
df = pd.DataFrame(arr, columns=["a", "b", "c", "prob"])
df["a"] = [nt[int(i)] for i in df["a"]]
df["b"] = [nt[int(i)] for i in df["b"]]
df["c"] = [nt[int(i)] for i in df["c"]]
return df
[docs]
@njit
def JC69_analytical_integral_double(aa, bb, cc, dd, ee, ff, t, mu):
"""
This function calculates the probability of observing the
nucleotides bb, cc, dd, ee and ff given aa, t and mu. aa, bb
and cc are the starting nucleotides, while dd is the end nucleotide.
ee is the nucleotide at the time of the first coalescent, while
ff is the nucleotide at the time of the second coalescent. t is the
total time of the interval. The returned value corresponds to integrating
the coalescent to e and f over the entirety of t.
Note that the coalescent rate is always 1 between two sequences, and 3
between 3 sequences.
P(b = bb, c == cc, d == dd, e == ee, f == ff | a == aa, Q, t)
d ^
| |
___f___ |
| | | t
__e__ | |
| | | |
a b c |
Parameters
----------
aa, bb, cc, dd, ee, ff : integer or string
nucleotide at a, b, c, d, e and f, respectively
t : numeric
Total time of the interval (from a/b/c to d)
mu : numeric
The mutation rate for the JC69 model
"""
alpha = 3 / 4 if aa == ee else -1 / 4
beta = 3 / 4 if ee == bb else -1 / 4
gamma = 3 / 4 if ee == ff else -1 / 4
delta = 3 / 4 if ff == cc else -1 / 4
epsilon = 3 / 4 if ff == dd else -1 / 4
res = (
3
* (
(-2 * delta * (-2 - 8 * gamma + mu)) / (-6 + mu + mu**2)
- (32 * alpha * beta * delta * (2 + mu + 8 * gamma * (1 + mu)))
/ (3 * (1 + mu) ** 2 * (2 + mu))
- (32 * alpha * beta * epsilon * (2 + mu + 8 * gamma * (1 + mu)))
/ (np.exp(mu * t) * (1 + mu) * (2 + mu) * (3 + mu))
- (
8
* alpha
* beta
* (1 + (16 * delta * epsilon) / np.exp(mu * t))
* (2 + mu + 8 * gamma * (1 + mu))
)
/ ((1 + mu) * (2 + mu) * (3 + 2 * mu))
+ (
16
* delta
* gamma
* (
(-1 + 2 * beta * (-2 + mu)) * (2 + mu)
+ 2 * alpha * (-2 + mu) * (2 + 8 * beta + mu)
)
)
/ ((-2 + mu) * (2 + mu) * (1 + 2 * mu))
- (
4
* (alpha + beta)
* (1 + 2 * gamma * (2 + mu))
* (
(3 + 2 * mu) * (3 * np.exp(mu * t) + 4 * epsilon * (3 + mu))
+ 12
* delta
* (np.exp(mu * t) * (3 + mu) + 4 * epsilon * (3 + 2 * mu))
)
)
/ (3 * np.exp(mu * t) * (2 + mu) * (3 + mu) * (3 + 2 * mu))
- (
2
* epsilon
* (
(2 + 8 * gamma - mu) / ((-3 + mu) * (-2 + mu))
+ (
(1 + mu) * (2 + 8 * beta + mu)
+ 8 * alpha * (1 + mu + 2 * beta * (2 + mu))
)
/ ((-1 + mu) * (1 + mu) * (2 + mu))
)
)
/ np.exp(mu * t)
- (
-16 * delta * epsilon * (2 + 8 * gamma - mu) * (2 + 3 * mu + mu**2)
+ np.exp(mu * t) * (-2 - 8 * gamma + mu) * (2 + 3 * mu + mu**2)
- 3
* np.exp(mu * t)
* (-2 + mu)
* (
(1 + mu) * (2 + 8 * beta + mu)
+ 8 * alpha * (1 + mu + 2 * beta * (2 + mu))
)
- 48
* epsilon
* (
2
* gamma
* (1 + mu)
* (
(-1 + 2 * beta * (-2 + mu)) * (2 + mu)
+ 2 * alpha * (-2 + mu) * (2 + 8 * beta + mu)
)
+ delta
* (-2 + mu)
* (
(1 + mu) * (2 + 8 * beta + mu)
+ 8 * alpha * (1 + mu + 2 * beta * (2 + mu))
)
)
)
/ (6 * np.exp(mu * t) * (-2 + mu) * (1 + mu) * (2 + mu))
+ (
2
* (
2
* np.exp(mu * t)
* gamma
* (1 + mu)
* (
(-1 + 2 * beta * (-2 + mu)) * (2 + mu)
+ 2 * alpha * (-2 + mu) * (2 + 8 * beta + mu)
)
+ delta
* (
32
* epsilon
* gamma
* (1 + mu)
* (
(-1 + 2 * beta * (-2 + mu)) * (2 + mu)
+ 2 * alpha * (-2 + mu) * (2 + 8 * beta + mu)
)
+ np.exp(mu * t)
* (-2 + mu)
* (
(1 + mu) * (2 + 8 * beta + mu)
+ 8 * alpha * (1 + mu + 2 * beta * (2 + mu))
)
)
)
)
/ (np.exp(mu * t) * (1 + mu) ** 2 * (-4 + mu**2))
+ (
(32 * alpha * beta * delta * (2 + mu + 8 * gamma * (1 + mu)))
/ (3 * (1 + mu) ** 2 * (2 + mu))
+ (
32
* alpha
* beta
* np.exp(mu * t)
* epsilon
* (2 + mu + 8 * gamma * (1 + mu))
)
/ ((1 + mu) * (2 + mu) * (3 + mu))
+ (
8
* alpha
* beta
* np.exp(mu * t)
* (1 + (16 * delta * epsilon) / np.exp(mu * t))
* (2 + mu + 8 * gamma * (1 + mu))
)
/ ((1 + mu) * (2 + mu) * (3 + 2 * mu))
+ (
4
* (alpha + beta)
* (1 + 2 * gamma * (2 + mu))
* (
(3 + 2 * mu)
* (
3 * np.exp(2 * mu * t)
+ 4 * np.exp(2 * mu * t) * epsilon * (3 + mu)
)
+ 12
* delta
* (
np.exp(mu * t) * (3 + mu)
+ 4 * np.exp(mu * t) * epsilon * (3 + 2 * mu)
)
)
)
/ (3 * (2 + mu) * (3 + mu) * (3 + 2 * mu))
+ np.exp(2 * (1 + mu) * t)
* (
(2 * delta * (-2 - 8 * gamma + mu))
/ (np.exp(2 * t) * (-6 + mu + mu**2))
- (
16
* delta
* gamma
* (
(-1 + 2 * beta * (-2 + mu)) * (2 + mu)
+ 2 * alpha * (-2 + mu) * (2 + 8 * beta + mu)
)
)
/ (np.exp(mu * t) * (-2 + mu) * (2 + mu) * (1 + 2 * mu))
+ 2
* np.exp(mu * t)
* epsilon
* (
(2 + 8 * gamma - mu) / (np.exp(2 * t) * (-3 + mu) * (-2 + mu))
+ (
(1 + mu) * (2 + 8 * beta + mu)
+ 8 * alpha * (1 + mu + 2 * beta * (2 + mu))
)
/ ((-1 + mu) * (1 + mu) * (2 + mu))
)
+ (
-16
* delta
* epsilon
* (2 + 8 * gamma - mu)
* (2 + 3 * mu + mu**2)
+ np.exp(mu * t) * (-2 - 8 * gamma + mu) * (2 + 3 * mu + mu**2)
- 3
* np.exp(2 * t + mu * t)
* (-2 + mu)
* (
(1 + mu) * (2 + 8 * beta + mu)
+ 8 * alpha * (1 + mu + 2 * beta * (2 + mu))
)
- 48
* np.exp(2 * t)
* epsilon
* (
2
* gamma
* (1 + mu)
* (
(-1 + 2 * beta * (-2 + mu)) * (2 + mu)
+ 2 * alpha * (-2 + mu) * (2 + 8 * beta + mu)
)
+ delta
* (-2 + mu)
* (
(1 + mu) * (2 + 8 * beta + mu)
+ 8 * alpha * (1 + mu + 2 * beta * (2 + mu))
)
)
)
/ (6 * np.exp(2 * t) * (-2 + mu) * (1 + mu) * (2 + mu))
- (
2
* (
2
* np.exp(mu * t)
* gamma
* (1 + mu)
* (
(-1 + 2 * beta * (-2 + mu)) * (2 + mu)
+ 2 * alpha * (-2 + mu) * (2 + 8 * beta + mu)
)
+ delta
* (
32
* epsilon
* gamma
* (1 + mu)
* (
(-1 + 2 * beta * (-2 + mu)) * (2 + mu)
+ 2 * alpha * (-2 + mu) * (2 + 8 * beta + mu)
)
+ np.exp(mu * t)
* (-2 + mu)
* (
(1 + mu) * (2 + 8 * beta + mu)
+ 8 * alpha * (1 + mu + 2 * beta * (2 + mu))
)
)
)
)
/ (np.exp(mu * t) * (1 + mu) ** 2 * (-4 + mu**2))
)
)
/ np.exp(3 * (1 + mu) * t)
)
) / (1024 * (1 + 0.5 / np.exp(3 * t) - 1.5 / np.exp(t)))
return res
[docs]
def p_b_c_d_given_a_JC69_analytical(t, mu):
"""
This function returns a data frame with the values of
P(b, c, d | a) for all combinations of nucleotides.
Parameters
----------
t : numeric
Total time of the interval (from a/b/c to d)
mu : numeric
The mutation rate for the JC69 model
"""
nt = ["A", "G", "C", "T"]
arr = np.empty((4**4, 5))
acc = 0
for aa in range(4):
for bb in range(4):
for cc in range(4):
for dd in range(4):
cumsum = 0
for ee in range(4):
for ff in range(4):
res = JC69_analytical_integral_double(
aa, bb, cc, dd, ee, ff, t, mu
)
cumsum += res
arr[acc] = [aa, bb, cc, dd, cumsum]
acc += 1
df = pd.DataFrame(arr, columns=["a", "b", "c", "d", "prob"])
df["a"] = [nt[int(i)] for i in df["a"]]
df["b"] = [nt[int(i)] for i in df["b"]]
df["c"] = [nt[int(i)] for i in df["c"]]
df["d"] = [nt[int(i)] for i in df["d"]]
return df
[docs]
def b_c_d_given_a_to_dict_a_b_c_d(df):
"""
This function converts the data frame as outputted
by p_b_c_given_a_single_coal or p_b_c_given_a_double_coal
into a dictionary. How to use the dictionary:
P(b, c, d | a) = dct[a][b][c][d]
Parameters
----------
df : data frame
As outputted by p_b_c_given_a_double_coal
"""
# df = df.groupby(['a', 'b', 'c', 'd']).sum().reset_index()
df = (
df.groupby(["a", "b", "c"])
.apply(lambda x: dict(zip(x.d, x.prob)))
.reset_index()
)
df.columns = ["a", "b", "c", "val"]
df = df.groupby(["a", "b"]).apply(lambda x: dict(zip(x.c, x.val))).reset_index()
df.columns = ["a", "b", "val"]
df = df.groupby("a").apply(lambda x: dict(zip(x.b, x.val))).to_dict()
return df
[docs]
def b_c_given_a_to_dict_a_b_c(df):
"""
This function converts the data frame as outputted
by p_b_c_given_a_single_coal or p_b_c_given_a_double_coal
into a dictionary. How to use the dictionary:
P(b, c | a) = dct[a][b][c]
Parameters
----------
df : data frame
As outputted by p_b_c_given_a_single_coal
"""
# df = df.groupby(['a', 'b', 'c']).sum().reset_index()
df = df.groupby(["a", "b"]).apply(lambda x: dict(zip(x.c, x.prob))).reset_index()
df.columns = ["a", "b", "val"]
df = df.groupby("a").apply(lambda x: dict(zip(x.b, x.val))).to_dict()
return df
[docs]
def b_given_a_to_dict_a_b(df):
"""
This function converts the data frame as outputted
by p_b_given_a into a dictionary. How to use the dictionary:
P(b | a) = dct[a][b]
Parameters
----------
df : data frame
As outputted by p_b_given_a
"""
return df.groupby(["a"]).apply(lambda x: dict(zip(x.b, x.prob))).to_dict()
[docs]
def calc_emissions_single_JC69(
a0_a1_t_vec,
b0_b1_t_vec,
a1b1_ab0_t,
ab0_ab1_t_vec,
ab1c1_abc0_t,
c0_c1_t_vec,
d0_abc0_t_vec,
a0_a1_mu_vec,
b0_b1_mu_vec,
a1b1_ab0_mu,
ab0_ab1_mu_vec,
ab1c1_abc0_mu,
c0_c1_mu_vec,
d0_abc0_mu_vec,
coal_rate_1,
coal_rate_2,
):
"""
This function returns the emission probabilities of a hidden
state contining two coalescent events at different time intervals.
_________
| |
---------abc0----- |
____|___ |
| | |
----ab1-------c1-- |
| | |
----ab0----- | |
__|__ | |
| | | |
--a1----b1-- | |
| | | |
a0 b0 c0 d0
Parameters
----------
a0_a1_t_vec, b0_b1_t_vec, c0_c1_t_vec,
d0_abc0_t_vec, ab0_ab1_t_vec : numeric list
Each list contains the interval time for a site to mutate
with a certain mutation rate, specified by *mu_vec
a1b1_ab0_t, ab1c1_abc0_t : numeric
Time interval when the first and the second coalescent
can happen, respectively.
a0_a1_mu_vec, b0_b1_mu_vec, c0_c1_mu_vec,
d0_abc0_mu_vec, ab0_ab1_mu_vec : numeric list
Each list contains the mutation rates for each interval
defined in *t_vec
a1b1_ab0_mu, ab1c1_abc0_mu : numeric
Mutation rates for the first and second coalescent intervals,
respectively.
coal_rate_1, coal_rate_2 : numeric
Coalescent rate of the first and second coalescent events,
respectively.
"""
# a0 to a1
Q_vec = [rate_mat_JC69(i) for i in a0_a1_mu_vec]
df_a = p_b_given_a(t=a0_a1_t_vec, Q=Q_vec)
df_a = b_given_a_to_dict_a_b(df_a)
# df_a[a0][a1]
# b1 to b0
Q_vec = [rate_mat_JC69(i) for i in list(reversed(b0_b1_mu_vec))]
df_b = p_b_given_a(t=list(reversed(b0_b1_t_vec)), Q=Q_vec)
df_b = b_given_a_to_dict_a_b(df_b)
# df_b[b1][b0]
# c1 to c0
Q_vec = [rate_mat_JC69(i) for i in list(reversed(c0_c1_mu_vec))]
df_c = p_b_given_a(t=list(reversed(c0_c1_t_vec)), Q=Q_vec)
df_c = b_given_a_to_dict_a_b(df_c)
# df_c[c1][c0]
# abc0 to d0
Q_vec = [rate_mat_JC69(i) for i in list(reversed(d0_abc0_mu_vec))]
df_d = p_b_given_a(t=list(reversed(d0_abc0_t_vec)), Q=Q_vec)
df_d = b_given_a_to_dict_a_b(df_d)
# df_d[abc0][d0]
# ab0 to ab1
Q_vec = [rate_mat_JC69(i) for i in ab0_ab1_mu_vec]
df_ab = p_b_given_a(t=ab0_ab1_t_vec, Q=Q_vec)
df_ab = b_given_a_to_dict_a_b(df_ab)
# df_ab[ab0][ab1]
# First coalescent
df_first = p_b_c_given_a_JC69_analytical(
t=a1b1_ab0_t, mu=a1b1_ab0_mu, k=coal_rate_1
)
df_first = b_c_given_a_to_dict_a_b_c(df_first)
# df_first[a1][b1][ab0]
# Second coalescent
df_second = p_b_c_given_a_JC69_analytical(
t=ab1c1_abc0_t, mu=ab1c1_abc0_mu, k=coal_rate_2
)
df_second = b_c_given_a_to_dict_a_b_c(df_second)
# df_second[ab1][c1][abc0]
emissions = {}
for a0 in ["A", "C", "T", "G"]:
for b0 in ["A", "C", "T", "G"]:
for c0 in ["A", "C", "T", "G"]:
for d0 in ["A", "C", "T", "G"]:
acc = 0
for a1 in ["A", "C", "T", "G"]:
for b1 in ["A", "C", "T", "G"]:
for c1 in ["A", "C", "T", "G"]:
for ab0 in ["A", "C", "T", "G"]:
for ab1 in ["A", "C", "T", "G"]:
for abc0 in ["A", "C", "T", "G"]:
res = 1
res = res * df_a[a0][a1]
res = res * df_b[b1][b0]
res = res * df_first[a1][b1][ab0]
res = res * df_ab[ab0][ab1]
res = res * df_second[ab1][c1][abc0]
res = res * df_c[c1][c0]
res = res * df_d[abc0][d0]
acc += res
emissions[a0 + b0 + c0 + d0] = acc / 4
return emissions
[docs]
def calc_emissions_double_JC69(
a0_a1_t_vec,
b0_b1_t_vec,
c0_c1_t_vec,
a1b1c1_abc0_t,
d0_abc0_t_vec,
a0_a1_mu_vec,
b0_b1_mu_vec,
c0_c1_mu_vec,
a1b1c1_abc0_mu,
d0_abc0_mu_vec,
):
"""
This function returns the emission probabilities of a hidden
state contining two coalescent events at the same time interval.
_________
| |
---------abc0----- |
____|___ |
__|__ | |
| | | |
--a1----b1----c1-- |
| | | |
a0 b0 c0 d0
Parameters
----------
a0_a1_t_vec, b0_b1_t_vec, c0_c1_t_vec, d0_abc0_t_vec : numeric list
Each list contains the interval time for a site to mutate
with a certain mutation rate, specified by *mu_vec
a1b1c1_abc0_t : numeric
Time interval for the coalescent events to happen.
a0_a1_mu_vec, b0_b1_mu_vec, c0_c1_mu_vec, d0_abc0_mu_vec : numeric list
Each list contains the mutation rates for each interval
defined in *t_vec
a1b1c1_abc0_mu : numeric
Mutation rates for the interval where coalescents happen.
"""
# a0 to a1
Q_vec = [rate_mat_JC69(i) for i in a0_a1_mu_vec]
df_a = p_b_given_a(t=a0_a1_t_vec, Q=Q_vec)
df_a = b_given_a_to_dict_a_b(df_a)
# df_a[a0][a1]
# b1 to b0
Q_vec = [rate_mat_JC69(i) for i in list(reversed(b0_b1_mu_vec))]
df_b = p_b_given_a(t=list(reversed(b0_b1_t_vec)), Q=Q_vec)
df_b = b_given_a_to_dict_a_b(df_b)
# df_b[b1][b0]
# c1 to c0
Q_vec = [rate_mat_JC69(i) for i in list(reversed(c0_c1_mu_vec))]
df_c = p_b_given_a(t=list(reversed(c0_c1_t_vec)), Q=Q_vec)
df_c = b_given_a_to_dict_a_b(df_c)
# df_c[c1][c0]
# abc0 to d0
Q_vec = [rate_mat_JC69(i) for i in list(reversed(d0_abc0_mu_vec))]
df_d = p_b_given_a(t=list(reversed(d0_abc0_t_vec)), Q=Q_vec)
df_d = b_given_a_to_dict_a_b(df_d)
# df_d[abc0][d0]
# Double coalescent
df_double = p_b_c_d_given_a_JC69_analytical(t=a1b1c1_abc0_t, mu=a1b1c1_abc0_mu)
df_double = b_c_d_given_a_to_dict_a_b_c_d(df_double)
# df_double[a1][b1][c1][abc0]
emissions = {}
for a0 in ["A", "C", "T", "G"]:
for b0 in ["A", "C", "T", "G"]:
for c0 in ["A", "C", "T", "G"]:
for d0 in ["A", "C", "T", "G"]:
acc = 0
for a1 in ["A", "C", "T", "G"]:
for b1 in ["A", "C", "T", "G"]:
for c1 in ["A", "C", "T", "G"]:
for abc0 in ["A", "C", "T", "G"]:
res = 1
res = res * df_a[a0][a1]
res = res * df_b[b1][b0]
res = res * df_c[c1][c0]
res = res * df_double[a1][b1][c1][abc0]
res = res * df_d[abc0][d0]
acc += res
emissions[a0 + b0 + c0 + d0] = acc / 4
return emissions
[docs]
def get_emission_prob_mat_introgression(
t_A,
t_B,
t_AB,
t_C,
t_upper,
t_out,
t_m,
rho_A,
rho_B,
rho_AB,
rho_C,
rho_ABC,
coal_A,
coal_B,
coal_AB,
coal_BC,
coal_C,
coal_ABC,
n_int_AB,
n_int_ABC,
mu_A,
mu_B,
mu_C,
mu_D,
mu_AB,
mu_ABC,
cut_AB,
cut_ABC,
):
"""
This function returns the emission probabilities of all hidden states
given a set of population genetics parameters.
# | |
# | ABC |\ \
# | AB |\ \ \ \
# / /\ \ \ \ \ \
# / / \ \ \ \ \ \
# A B C D
Parameters
----------
t_A : numeric
Time between present time and the first speciation time for
species A.
t_B : numeric
Time between present time and the migration event for
species A.
t_AB : numeric
Time between speciation events.
t_C : numeric
Time between present time and the migration event for
species C.
t_upper : numeric
Time between the last ABC interval and the third speciation time.
t_peak : numeric
Mean divergence time between ABC and D after the third speciation time.
It should be 4*coal_ABC (or it can be estimated instead).
rho_A, rho_B, rho_AB, rho_C, rho_ABC : numeric
Recombination rates for the A, B, AB, C and ABC intervals.
coal_A, coal_B, coal_AB, coal_C, coal_ABC : numeric
Coalescent rates for the A, B, AB, C and ABC intervals.
n_int_AB, n_int_ABC : integer
Number of intervals in the AB and ABC parts of the tree.
mu_A, mu_B, mu_C, mu_D, mu_AB, mu_ABC : numeric
Mutation rate for the A, B, C, D, AB and ABC intervals.
"""
n_markov_states = (
2 * n_int_AB * n_int_ABC + n_int_ABC * 3 + 3 * comb(n_int_ABC, 2, exact=True)
)
cut_BC = np.concatenate([[0], (cut_AB[1:] + t_m)])
probs = np.empty((n_markov_states), dtype=object)
states = np.empty((n_markov_states), dtype=object)
# Deep coalescence, two single coalescents
acc = 0
for i in range(n_int_ABC):
for j in range(i + 1, n_int_ABC):
a0_a1_t_vec = [t_A, t_AB, cut_ABC[i]]
a0_a1_mu_vec = [mu_A, mu_AB, mu_ABC]
b0_b1_t_vec = [t_B + t_m, t_AB, cut_ABC[i]]
b0_b1_mu_vec = [mu_B, mu_AB, mu_ABC]
c0_c1_t_vec = [t_C + t_m + t_AB, cut_ABC[i]]
c0_c1_mu_vec = [mu_C, mu_ABC]
ab0_ab1_t_vec = [cut_ABC[j] - cut_ABC[i + 1]]
ab0_ab1_mu_vec = [mu_ABC]
a1b1_ab0_t = cut_ABC[i + 1] - cut_ABC[i]
a1b1_ab0_mu = mu_ABC
ab1c1_abc0_t = (
(cut_ABC[j + 1] - cut_ABC[j]) if j != (n_int_ABC - 1) else t_upper
)
ab1c1_abc0_mu = mu_ABC
add = (
t_upper + cut_ABC[n_int_ABC - 1] - cut_ABC[j + 1]
if j != (n_int_ABC - 1)
else 0
)
# d0_abc0_t_vec = [t_A+t_AB+cut_ABC[n_int_ABC-1]+t_upper]+[t_peak+add]
d0_abc0_t_vec = [t_out] + [add]
# d0_abc0_mu_vec = [mu_D, mu_ABC]
d0_abc0_mu_vec = [mu_D, mu_ABC]
# V1 states
emissions = calc_emissions_single_JC69(
a0_a1_t_vec,
b0_b1_t_vec,
a1b1_ab0_t,
ab0_ab1_t_vec,
ab1c1_abc0_t,
c0_c1_t_vec,
d0_abc0_t_vec,
a0_a1_mu_vec,
b0_b1_mu_vec,
a1b1_ab0_mu,
ab0_ab1_mu_vec,
ab1c1_abc0_mu,
c0_c1_mu_vec,
d0_abc0_mu_vec,
coal_ABC,
coal_ABC,
)
states[acc] = (1, i, j)
probs[acc] = emissions
acc += 1
# V2 states
emissions = calc_emissions_single_JC69(
a0_a1_t_vec,
c0_c1_t_vec,
a1b1_ab0_t,
ab0_ab1_t_vec,
ab1c1_abc0_t,
b0_b1_t_vec,
d0_abc0_t_vec,
a0_a1_mu_vec,
c0_c1_mu_vec,
a1b1_ab0_mu,
ab0_ab1_mu_vec,
ab1c1_abc0_mu,
b0_b1_mu_vec,
d0_abc0_mu_vec,
coal_ABC,
coal_ABC,
)
new_emissions = {}
for k in list(emissions.keys()):
new_emissions[k[0] + k[2] + k[1] + k[3]] = emissions[k]
states[acc] = (2, i, j)
probs[acc] = new_emissions
acc += 1
# V3 states
emissions = calc_emissions_single_JC69(
b0_b1_t_vec,
c0_c1_t_vec,
a1b1_ab0_t,
ab0_ab1_t_vec,
ab1c1_abc0_t,
a0_a1_t_vec,
d0_abc0_t_vec,
b0_b1_mu_vec,
c0_c1_mu_vec,
a1b1_ab0_mu,
ab0_ab1_mu_vec,
ab1c1_abc0_mu,
a0_a1_mu_vec,
d0_abc0_mu_vec,
coal_ABC,
coal_ABC,
)
new_emissions = {}
for k in list(emissions.keys()):
new_emissions[k[2] + k[0] + k[1] + k[3]] = emissions[k]
states[acc] = (3, i, j)
probs[acc] = new_emissions
acc += 1
# Deep coalescence, one double coalescent
for i in range(n_int_ABC):
a0_a1_t_vec = [t_A, t_AB, cut_ABC[i]]
a0_a1_mu_vec = [mu_A, mu_AB, mu_ABC]
b0_b1_t_vec = [t_B + t_m, t_AB, cut_ABC[i]]
b0_b1_mu_vec = [mu_B, mu_AB, mu_ABC]
c0_c1_t_vec = [t_C + t_m + t_AB, cut_ABC[i]]
c0_c1_mu_vec = [mu_C, mu_ABC]
a1b1c1_abc0_t = (
(cut_ABC[i + 1] - cut_ABC[i]) if i != (n_int_ABC - 1) else t_upper
)
a1b1c1_abc0_mu = mu_ABC
add = (
t_upper + cut_ABC[n_int_ABC - 1] - cut_ABC[i + 1]
if i != (n_int_ABC - 1)
else 0
)
# d0_abc0_t_vec = [t_A+t_AB+cut_ABC[n_int_ABC-1]+t_upper]+[t_peak+add]
d0_abc0_t_vec = [t_out] + [add]
# d0_abc0_mu_vec = [mu_D, mu_ABC]
d0_abc0_mu_vec = [mu_D, mu_ABC]
# V1 states
emissions = calc_emissions_double_JC69(
a0_a1_t_vec,
b0_b1_t_vec,
c0_c1_t_vec,
a1b1c1_abc0_t,
d0_abc0_t_vec,
a0_a1_mu_vec,
b0_b1_mu_vec,
c0_c1_mu_vec,
a1b1c1_abc0_mu,
d0_abc0_mu_vec,
)
markov = (1, i, i)
states[acc] = markov
probs[acc] = emissions
acc += 1
# V2 states
emissions = calc_emissions_double_JC69(
a0_a1_t_vec,
c0_c1_t_vec,
b0_b1_t_vec,
a1b1c1_abc0_t,
d0_abc0_t_vec,
a0_a1_mu_vec,
c0_c1_mu_vec,
b0_b1_mu_vec,
a1b1c1_abc0_mu,
d0_abc0_mu_vec,
)
new_emissions = {}
for k in list(emissions.keys()):
new_emissions[k[0] + k[2] + k[1] + k[3]] = emissions[k]
markov = (2, i, i)
states[acc] = markov
probs[acc] = new_emissions
acc += 1
# V3 states
emissions = calc_emissions_double_JC69(
b0_b1_t_vec,
c0_c1_t_vec,
a0_a1_t_vec,
a1b1c1_abc0_t,
d0_abc0_t_vec,
b0_b1_mu_vec,
c0_c1_mu_vec,
a0_a1_mu_vec,
a1b1c1_abc0_mu,
d0_abc0_mu_vec,
)
new_emissions = {}
for k in list(emissions.keys()):
new_emissions[k[2] + k[0] + k[1] + k[3]] = emissions[k]
markov = (3, i, i)
states[acc] = markov
probs[acc] = new_emissions
acc += 1
# V0 states
for i in range(n_int_AB):
for j in range(n_int_ABC):
a0_a1_t_vec = [t_A, cut_AB[i]]
a0_a1_mu_vec = [mu_A, mu_AB]
b0_b1_t_vec = [t_B + t_m, cut_AB[i]]
b0_b1_mu_vec = [mu_B, mu_AB]
c0_c1_t_vec = [t_C + t_m + t_AB, cut_ABC[j]]
c0_c1_mu_vec = [mu_C, mu_ABC]
ab0_ab1_t_vec = [t_AB - cut_AB[i + 1], cut_ABC[j]]
ab0_ab1_mu_vec = [mu_AB, mu_ABC]
a1b1_ab0_t = cut_AB[i + 1] - cut_AB[i]
a1b1_ab0_mu = mu_AB
ab1c1_abc0_t = (
cut_ABC[j + 1] - cut_ABC[j] if j != (n_int_ABC - 1) else t_upper
)
ab1c1_abc0_mu = mu_ABC
add = (
t_upper + cut_ABC[n_int_ABC - 1] - cut_ABC[j + 1]
if j != (n_int_ABC - 1)
else 0
)
# d0_abc0_t_vec = [t_A+t_AB+cut_ABC[n_int_ABC-1]+t_upper]+[t_peak+add]
d0_abc0_t_vec = [t_out] + [add]
# d0_abc0_mu_vec = [mu_D, mu_ABC]
d0_abc0_mu_vec = [mu_D, mu_ABC]
emissions = calc_emissions_single_JC69(
a0_a1_t_vec,
b0_b1_t_vec,
a1b1_ab0_t,
ab0_ab1_t_vec,
ab1c1_abc0_t,
c0_c1_t_vec,
d0_abc0_t_vec,
a0_a1_mu_vec,
b0_b1_mu_vec,
a1b1_ab0_mu,
ab0_ab1_mu_vec,
ab1c1_abc0_mu,
c0_c1_mu_vec,
d0_abc0_mu_vec,
coal_AB,
coal_ABC,
)
states[acc] = (0, i, j)
probs[acc] = emissions
acc += 1
# Introgression states
for i in range(n_int_AB):
for j in range(n_int_ABC):
a0_a1_t_vec = [t_B, cut_BC[i]]
a0_a1_mu_vec = [mu_B, mu_AB]
b0_b1_t_vec = [t_C, cut_BC[i]]
b0_b1_mu_vec = [mu_C, mu_AB]
c0_c1_t_vec = [t_A, t_AB, cut_ABC[j]]
c0_c1_mu_vec = [mu_A, mu_AB, mu_ABC]
ab0_ab1_t_vec = [t_AB + t_m - cut_BC[i + 1], cut_ABC[j]]
ab0_ab1_mu_vec = [mu_AB, mu_ABC]
a1b1_ab0_t = cut_BC[i + 1] - cut_BC[i]
a1b1_ab0_mu = mu_AB
ab1c1_abc0_t = (
cut_ABC[j + 1] - cut_ABC[j] if j != (n_int_ABC - 1) else t_upper
)
ab1c1_abc0_mu = mu_ABC
add = (
t_upper + cut_ABC[n_int_ABC - 1] - cut_ABC[j + 1]
if j != (n_int_ABC - 1)
else 0
)
# d0_abc0_t_vec = [t_A+t_AB+cut_ABC[n_int_ABC-1]+t_upper]+[t_peak+add]
d0_abc0_t_vec = [t_out] + [add]
# d0_abc0_mu_vec = [mu_D, mu_ABC]
d0_abc0_mu_vec = [mu_D, mu_ABC]
emissions = calc_emissions_single_JC69(
a0_a1_t_vec,
b0_b1_t_vec,
a1b1_ab0_t,
ab0_ab1_t_vec,
ab1c1_abc0_t,
c0_c1_t_vec,
d0_abc0_t_vec,
a0_a1_mu_vec,
b0_b1_mu_vec,
a1b1_ab0_mu,
ab0_ab1_mu_vec,
ab1c1_abc0_mu,
c0_c1_mu_vec,
d0_abc0_mu_vec,
coal_BC,
coal_ABC,
)
new_emissions = {}
for k in list(emissions.keys()):
new_emissions[k[2] + k[0] + k[1] + k[3]] = emissions[k]
markov = (4, i, j)
states[acc] = markov
probs[acc] = new_emissions
acc += 1
probs = pd.DataFrame(list(probs))
probs.insert(0, "hidden_state", states)
return probs