Source code for itrails.int_get_emission_prob_mat

import numpy as np
import pandas as pd
from numba import njit
from scipy.linalg import expm
from scipy.special import comb

from itrails.cutpoints import cutpoints_AB, cutpoints_ABC



[docs]
def rate_mat_JC69(mu):
    """
    This function returns the rate matrix for the JC69 model.

    Parameters
    ----------
    mu : numeric
        Mutation rate
    """
    return np.full((4, 4), mu / 4) - np.diag([mu, mu, mu, mu])




[docs]
def p_b_given_a(t, Q):
    """
    This function calculates the probability of observing the
    nucleotide b given a, t and Q. a is the starting nucleotide,
    while b is the end nucleotide. t is the total time of the interval.

    P(b = bb | a == aa, Q, t)

    Parameters
    ----------
    t : numeric
        Total time of the interval (from a/b to c)
    Q : numpy array
        A 4x4 rate matrix for any substitution model
    """
    nt = ["A", "G", "C", "T"]
    mat = np.zeros((4, 4))
    for i in range(len(t)):
        mat = mat + t[i] * Q[i]
    arr = np.empty((4**2, 3))
    acc = 0
    mat = expm(mat)
    for aa in range(4):
        for bb in range(4):
            arr[acc] = [aa, bb, mat[aa, bb]]
            acc += 1
    df = pd.DataFrame(arr, columns=["a", "b", "prob"])
    df["a"] = [nt[int(i)] for i in df["a"]]
    df["b"] = [nt[int(i)] for i in df["b"]]
    return df




[docs]
@njit
def JC69_analytical_integral(aa, bb, cc, dd, t, mu, k):
    """
    This function calculates the probability of observing the
    nucleotides bb, cc and dd given aa, t and mu. aa and bb are the starting
    nucleotides, while cc is the end nucleotide. dd is the nucleotide at
    the time of coalescent. t is the total time of the interval. The
    returned value corresponds to integrating the coalescent to d over
    the entirety of t.

    P(b = bb, c == cc, d == dd | a == aa, mu, t)

          c     ^
          |     |
        __d__   | t
       |     |  |
       a     b  |

    Parameters
    ----------
    aa, bb, cc, dd : integer or string
        nucleotide at a, b, c and d respectively
    t : numeric
        Total time of the interval (from a/b/c to d)
    mu : numeric
        The mutation rate for the JC69 model
    k : numeric
        The coalescent rate
    """
    alpha = 3 / 4 if aa == dd else -1 / 4
    beta = 3 / 4 if dd == bb else -1 / 4
    gamma = 3 / 4 if dd == cc else -1 / 4

    res = (
        k
        * (
            ((-1 + np.exp(k * t)) * (np.exp(mu * t) + 16 * (alpha + beta) * gamma))
            / (np.exp((k + mu) * t) * k)
            + 4
            * (
                gamma / (np.exp(k * t) * (-k + mu))
                + (alpha + beta) / (k + mu)
                - (alpha + beta) / (np.exp((k + mu) * t) * (k + mu))
                + (4 * alpha * beta) / (k + 2 * mu)
                + (gamma * ((k - mu) ** (-1) + (16 * alpha * beta) / (k + mu)))
                / np.exp(mu * t)
                + (4 * alpha * beta * ((-4 * gamma) / (k + mu) - (k + 2 * mu) ** (-1)))
                / np.exp((k + 2 * mu) * t)
            )
        )
    ) / (64 * (1 - np.exp(-(k * t))))

    return res




[docs]
def p_b_c_given_a_JC69_analytical(t, mu, k):
    """
    This function returns a data frame with the values of
    P(b, c | a) for all combinations of nucleotides.

    Parameters
    ----------
    t : numeric
        Total time of the interval (from a/b/c to d)
    mu : numeric
        The mutation rate for the JC69 model
    k : numeric
        The coalescent rate
    """
    nt = ["A", "G", "C", "T"]
    arr = np.empty((4**3, 4))
    acc = 0
    for aa in range(4):
        for bb in range(4):
            for cc in range(4):
                cumsum = 0
                for dd in range(4):
                    cumsum += JC69_analytical_integral(aa, bb, cc, dd, t, mu, k)
                arr[acc] = [aa, bb, cc, cumsum]
                acc += 1
    df = pd.DataFrame(arr, columns=["a", "b", "c", "prob"])
    df["a"] = [nt[int(i)] for i in df["a"]]
    df["b"] = [nt[int(i)] for i in df["b"]]
    df["c"] = [nt[int(i)] for i in df["c"]]
    return df




[docs]
@njit
def JC69_analytical_integral_double(aa, bb, cc, dd, ee, ff, t, mu):
    """
    This function calculates the probability of observing the
    nucleotides bb, cc, dd, ee and ff given aa, t and mu. aa, bb
    and cc are the starting nucleotides, while dd is the end nucleotide.
    ee is the nucleotide at the time of the first coalescent, while
    ff is the nucleotide at the time of the second coalescent. t is the
    total time of the interval. The returned value corresponds to integrating
    the coalescent to e and f over the entirety of t.

    Note that the coalescent rate is always 1 between two sequences, and 3
    between 3 sequences.

    P(b = bb, c == cc, d == dd, e == ee, f == ff | a == aa, Q, t)

              d       ^
              |       |
           ___f___    |
          |       |   | t
        __e__     |   |
       |     |    |   |
       a     b    c   |

    Parameters
    ----------
    aa, bb, cc, dd, ee, ff : integer or string
        nucleotide at a, b, c, d, e and f, respectively
    t : numeric
        Total time of the interval (from a/b/c to d)
    mu : numeric
        The mutation rate for the JC69 model
    """

    alpha = 3 / 4 if aa == ee else -1 / 4
    beta = 3 / 4 if ee == bb else -1 / 4
    gamma = 3 / 4 if ee == ff else -1 / 4
    delta = 3 / 4 if ff == cc else -1 / 4
    epsilon = 3 / 4 if ff == dd else -1 / 4

    res = (
        3
        * (
            (-2 * delta * (-2 - 8 * gamma + mu)) / (-6 + mu + mu**2)
            - (32 * alpha * beta * delta * (2 + mu + 8 * gamma * (1 + mu)))
            / (3 * (1 + mu) ** 2 * (2 + mu))
            - (32 * alpha * beta * epsilon * (2 + mu + 8 * gamma * (1 + mu)))
            / (np.exp(mu * t) * (1 + mu) * (2 + mu) * (3 + mu))
            - (
                8
                * alpha
                * beta
                * (1 + (16 * delta * epsilon) / np.exp(mu * t))
                * (2 + mu + 8 * gamma * (1 + mu))
            )
            / ((1 + mu) * (2 + mu) * (3 + 2 * mu))
            + (
                16
                * delta
                * gamma
                * (
                    (-1 + 2 * beta * (-2 + mu)) * (2 + mu)
                    + 2 * alpha * (-2 + mu) * (2 + 8 * beta + mu)
                )
            )
            / ((-2 + mu) * (2 + mu) * (1 + 2 * mu))
            - (
                4
                * (alpha + beta)
                * (1 + 2 * gamma * (2 + mu))
                * (
                    (3 + 2 * mu) * (3 * np.exp(mu * t) + 4 * epsilon * (3 + mu))
                    + 12
                    * delta
                    * (np.exp(mu * t) * (3 + mu) + 4 * epsilon * (3 + 2 * mu))
                )
            )
            / (3 * np.exp(mu * t) * (2 + mu) * (3 + mu) * (3 + 2 * mu))
            - (
                2
                * epsilon
                * (
                    (2 + 8 * gamma - mu) / ((-3 + mu) * (-2 + mu))
                    + (
                        (1 + mu) * (2 + 8 * beta + mu)
                        + 8 * alpha * (1 + mu + 2 * beta * (2 + mu))
                    )
                    / ((-1 + mu) * (1 + mu) * (2 + mu))
                )
            )
            / np.exp(mu * t)
            - (
                -16 * delta * epsilon * (2 + 8 * gamma - mu) * (2 + 3 * mu + mu**2)
                + np.exp(mu * t) * (-2 - 8 * gamma + mu) * (2 + 3 * mu + mu**2)
                - 3
                * np.exp(mu * t)
                * (-2 + mu)
                * (
                    (1 + mu) * (2 + 8 * beta + mu)
                    + 8 * alpha * (1 + mu + 2 * beta * (2 + mu))
                )
                - 48
                * epsilon
                * (
                    2
                    * gamma
                    * (1 + mu)
                    * (
                        (-1 + 2 * beta * (-2 + mu)) * (2 + mu)
                        + 2 * alpha * (-2 + mu) * (2 + 8 * beta + mu)
                    )
                    + delta
                    * (-2 + mu)
                    * (
                        (1 + mu) * (2 + 8 * beta + mu)
                        + 8 * alpha * (1 + mu + 2 * beta * (2 + mu))
                    )
                )
            )
            / (6 * np.exp(mu * t) * (-2 + mu) * (1 + mu) * (2 + mu))
            + (
                2
                * (
                    2
                    * np.exp(mu * t)
                    * gamma
                    * (1 + mu)
                    * (
                        (-1 + 2 * beta * (-2 + mu)) * (2 + mu)
                        + 2 * alpha * (-2 + mu) * (2 + 8 * beta + mu)
                    )
                    + delta
                    * (
                        32
                        * epsilon
                        * gamma
                        * (1 + mu)
                        * (
                            (-1 + 2 * beta * (-2 + mu)) * (2 + mu)
                            + 2 * alpha * (-2 + mu) * (2 + 8 * beta + mu)
                        )
                        + np.exp(mu * t)
                        * (-2 + mu)
                        * (
                            (1 + mu) * (2 + 8 * beta + mu)
                            + 8 * alpha * (1 + mu + 2 * beta * (2 + mu))
                        )
                    )
                )
            )
            / (np.exp(mu * t) * (1 + mu) ** 2 * (-4 + mu**2))
            + (
                (32 * alpha * beta * delta * (2 + mu + 8 * gamma * (1 + mu)))
                / (3 * (1 + mu) ** 2 * (2 + mu))
                + (
                    32
                    * alpha
                    * beta
                    * np.exp(mu * t)
                    * epsilon
                    * (2 + mu + 8 * gamma * (1 + mu))
                )
                / ((1 + mu) * (2 + mu) * (3 + mu))
                + (
                    8
                    * alpha
                    * beta
                    * np.exp(mu * t)
                    * (1 + (16 * delta * epsilon) / np.exp(mu * t))
                    * (2 + mu + 8 * gamma * (1 + mu))
                )
                / ((1 + mu) * (2 + mu) * (3 + 2 * mu))
                + (
                    4
                    * (alpha + beta)
                    * (1 + 2 * gamma * (2 + mu))
                    * (
                        (3 + 2 * mu)
                        * (
                            3 * np.exp(2 * mu * t)
                            + 4 * np.exp(2 * mu * t) * epsilon * (3 + mu)
                        )
                        + 12
                        * delta
                        * (
                            np.exp(mu * t) * (3 + mu)
                            + 4 * np.exp(mu * t) * epsilon * (3 + 2 * mu)
                        )
                    )
                )
                / (3 * (2 + mu) * (3 + mu) * (3 + 2 * mu))
                + np.exp(2 * (1 + mu) * t)
                * (
                    (2 * delta * (-2 - 8 * gamma + mu))
                    / (np.exp(2 * t) * (-6 + mu + mu**2))
                    - (
                        16
                        * delta
                        * gamma
                        * (
                            (-1 + 2 * beta * (-2 + mu)) * (2 + mu)
                            + 2 * alpha * (-2 + mu) * (2 + 8 * beta + mu)
                        )
                    )
                    / (np.exp(mu * t) * (-2 + mu) * (2 + mu) * (1 + 2 * mu))
                    + 2
                    * np.exp(mu * t)
                    * epsilon
                    * (
                        (2 + 8 * gamma - mu) / (np.exp(2 * t) * (-3 + mu) * (-2 + mu))
                        + (
                            (1 + mu) * (2 + 8 * beta + mu)
                            + 8 * alpha * (1 + mu + 2 * beta * (2 + mu))
                        )
                        / ((-1 + mu) * (1 + mu) * (2 + mu))
                    )
                    + (
                        -16
                        * delta
                        * epsilon
                        * (2 + 8 * gamma - mu)
                        * (2 + 3 * mu + mu**2)
                        + np.exp(mu * t) * (-2 - 8 * gamma + mu) * (2 + 3 * mu + mu**2)
                        - 3
                        * np.exp(2 * t + mu * t)
                        * (-2 + mu)
                        * (
                            (1 + mu) * (2 + 8 * beta + mu)
                            + 8 * alpha * (1 + mu + 2 * beta * (2 + mu))
                        )
                        - 48
                        * np.exp(2 * t)
                        * epsilon
                        * (
                            2
                            * gamma
                            * (1 + mu)
                            * (
                                (-1 + 2 * beta * (-2 + mu)) * (2 + mu)
                                + 2 * alpha * (-2 + mu) * (2 + 8 * beta + mu)
                            )
                            + delta
                            * (-2 + mu)
                            * (
                                (1 + mu) * (2 + 8 * beta + mu)
                                + 8 * alpha * (1 + mu + 2 * beta * (2 + mu))
                            )
                        )
                    )
                    / (6 * np.exp(2 * t) * (-2 + mu) * (1 + mu) * (2 + mu))
                    - (
                        2
                        * (
                            2
                            * np.exp(mu * t)
                            * gamma
                            * (1 + mu)
                            * (
                                (-1 + 2 * beta * (-2 + mu)) * (2 + mu)
                                + 2 * alpha * (-2 + mu) * (2 + 8 * beta + mu)
                            )
                            + delta
                            * (
                                32
                                * epsilon
                                * gamma
                                * (1 + mu)
                                * (
                                    (-1 + 2 * beta * (-2 + mu)) * (2 + mu)
                                    + 2 * alpha * (-2 + mu) * (2 + 8 * beta + mu)
                                )
                                + np.exp(mu * t)
                                * (-2 + mu)
                                * (
                                    (1 + mu) * (2 + 8 * beta + mu)
                                    + 8 * alpha * (1 + mu + 2 * beta * (2 + mu))
                                )
                            )
                        )
                    )
                    / (np.exp(mu * t) * (1 + mu) ** 2 * (-4 + mu**2))
                )
            )
            / np.exp(3 * (1 + mu) * t)
        )
    ) / (1024 * (1 + 0.5 / np.exp(3 * t) - 1.5 / np.exp(t)))
    return res




[docs]
def p_b_c_d_given_a_JC69_analytical(t, mu):
    """
    This function returns a data frame with the values of
    P(b, c, d | a) for all combinations of nucleotides.

    Parameters
    ----------
    t : numeric
        Total time of the interval (from a/b/c to d)
    mu : numeric
        The mutation rate for the JC69 model
    """
    nt = ["A", "G", "C", "T"]
    arr = np.empty((4**4, 5))
    acc = 0
    for aa in range(4):
        for bb in range(4):
            for cc in range(4):
                for dd in range(4):
                    cumsum = 0
                    for ee in range(4):
                        for ff in range(4):
                            res = JC69_analytical_integral_double(
                                aa, bb, cc, dd, ee, ff, t, mu
                            )
                            cumsum += res
                    arr[acc] = [aa, bb, cc, dd, cumsum]
                    acc += 1
    df = pd.DataFrame(arr, columns=["a", "b", "c", "d", "prob"])
    df["a"] = [nt[int(i)] for i in df["a"]]
    df["b"] = [nt[int(i)] for i in df["b"]]
    df["c"] = [nt[int(i)] for i in df["c"]]
    df["d"] = [nt[int(i)] for i in df["d"]]
    return df




[docs]
def b_c_d_given_a_to_dict_a_b_c_d(df):
    """
    This function converts the data frame as outputted
    by p_b_c_given_a_single_coal or p_b_c_given_a_double_coal
    into a dictionary. How to use the dictionary:

        P(b, c, d | a) = dct[a][b][c][d]

    Parameters
    ----------
    df : data frame
        As outputted by p_b_c_given_a_double_coal
    """
    # df = df.groupby(['a', 'b', 'c', 'd']).sum().reset_index()
    df = (
        df.groupby(["a", "b", "c"])
        .apply(lambda x: dict(zip(x.d, x.prob)))
        .reset_index()
    )
    df.columns = ["a", "b", "c", "val"]
    df = df.groupby(["a", "b"]).apply(lambda x: dict(zip(x.c, x.val))).reset_index()
    df.columns = ["a", "b", "val"]
    df = df.groupby("a").apply(lambda x: dict(zip(x.b, x.val))).to_dict()
    return df




[docs]
def b_c_given_a_to_dict_a_b_c(df):
    """
    This function converts the data frame as outputted
    by p_b_c_given_a_single_coal or p_b_c_given_a_double_coal
    into a dictionary. How to use the dictionary:

        P(b, c | a) = dct[a][b][c]

    Parameters
    ----------
    df : data frame
        As outputted by p_b_c_given_a_single_coal
    """
    # df = df.groupby(['a', 'b', 'c']).sum().reset_index()
    df = df.groupby(["a", "b"]).apply(lambda x: dict(zip(x.c, x.prob))).reset_index()
    df.columns = ["a", "b", "val"]
    df = df.groupby("a").apply(lambda x: dict(zip(x.b, x.val))).to_dict()
    return df




[docs]
def b_given_a_to_dict_a_b(df):
    """
    This function converts the data frame as outputted
    by p_b_given_a into a dictionary. How to use the dictionary:

        P(b | a) = dct[a][b]

    Parameters
    ----------
    df : data frame
        As outputted by p_b_given_a
    """
    return df.groupby(["a"]).apply(lambda x: dict(zip(x.b, x.prob))).to_dict()




[docs]
def calc_emissions_single_JC69(
    a0_a1_t_vec,
    b0_b1_t_vec,
    a1b1_ab0_t,
    ab0_ab1_t_vec,
    ab1c1_abc0_t,
    c0_c1_t_vec,
    d0_abc0_t_vec,
    a0_a1_mu_vec,
    b0_b1_mu_vec,
    a1b1_ab0_mu,
    ab0_ab1_mu_vec,
    ab1c1_abc0_mu,
    c0_c1_mu_vec,
    d0_abc0_mu_vec,
    coal_rate_1,
    coal_rate_2,
):
    """
    This function returns the emission probabilities of a hidden
    state contining two coalescent events at different time intervals.

                    _________
                   |         |
         ---------abc0-----  |
               ____|___      |
              |        |     |
         ----ab1-------c1--  |
              |        |     |
         ----ab0-----  |     |
            __|__      |     |
           |     |     |     |
         --a1----b1--  |     |
           |     |     |     |
           a0    b0    c0    d0

    Parameters
    ----------
    a0_a1_t_vec, b0_b1_t_vec, c0_c1_t_vec,
    d0_abc0_t_vec, ab0_ab1_t_vec : numeric list
        Each list contains the interval time for a site to mutate
        with a certain mutation rate, specified by *mu_vec
    a1b1_ab0_t, ab1c1_abc0_t : numeric
        Time interval when the first and the second coalescent
        can happen, respectively.
    a0_a1_mu_vec, b0_b1_mu_vec, c0_c1_mu_vec,
    d0_abc0_mu_vec, ab0_ab1_mu_vec : numeric list
        Each list contains the mutation rates for each interval
        defined in *t_vec
    a1b1_ab0_mu, ab1c1_abc0_mu : numeric
        Mutation rates for the first and second coalescent intervals,
        respectively.
    coal_rate_1, coal_rate_2 : numeric
        Coalescent rate  of the first and second coalescent events,
        respectively.
    """

    # a0 to a1
    Q_vec = [rate_mat_JC69(i) for i in a0_a1_mu_vec]
    df_a = p_b_given_a(t=a0_a1_t_vec, Q=Q_vec)
    df_a = b_given_a_to_dict_a_b(df_a)
    # df_a[a0][a1]

    # b1 to b0
    Q_vec = [rate_mat_JC69(i) for i in list(reversed(b0_b1_mu_vec))]
    df_b = p_b_given_a(t=list(reversed(b0_b1_t_vec)), Q=Q_vec)
    df_b = b_given_a_to_dict_a_b(df_b)
    # df_b[b1][b0]

    # c1 to c0
    Q_vec = [rate_mat_JC69(i) for i in list(reversed(c0_c1_mu_vec))]
    df_c = p_b_given_a(t=list(reversed(c0_c1_t_vec)), Q=Q_vec)
    df_c = b_given_a_to_dict_a_b(df_c)
    # df_c[c1][c0]

    # abc0 to d0
    Q_vec = [rate_mat_JC69(i) for i in list(reversed(d0_abc0_mu_vec))]
    df_d = p_b_given_a(t=list(reversed(d0_abc0_t_vec)), Q=Q_vec)
    df_d = b_given_a_to_dict_a_b(df_d)
    # df_d[abc0][d0]

    # ab0 to ab1
    Q_vec = [rate_mat_JC69(i) for i in ab0_ab1_mu_vec]
    df_ab = p_b_given_a(t=ab0_ab1_t_vec, Q=Q_vec)
    df_ab = b_given_a_to_dict_a_b(df_ab)
    # df_ab[ab0][ab1]

    # First coalescent
    df_first = p_b_c_given_a_JC69_analytical(
        t=a1b1_ab0_t, mu=a1b1_ab0_mu, k=coal_rate_1
    )
    df_first = b_c_given_a_to_dict_a_b_c(df_first)
    # df_first[a1][b1][ab0]

    # Second coalescent
    df_second = p_b_c_given_a_JC69_analytical(
        t=ab1c1_abc0_t, mu=ab1c1_abc0_mu, k=coal_rate_2
    )
    df_second = b_c_given_a_to_dict_a_b_c(df_second)
    # df_second[ab1][c1][abc0]

    emissions = {}
    for a0 in ["A", "C", "T", "G"]:
        for b0 in ["A", "C", "T", "G"]:
            for c0 in ["A", "C", "T", "G"]:
                for d0 in ["A", "C", "T", "G"]:
                    acc = 0
                    for a1 in ["A", "C", "T", "G"]:
                        for b1 in ["A", "C", "T", "G"]:
                            for c1 in ["A", "C", "T", "G"]:
                                for ab0 in ["A", "C", "T", "G"]:
                                    for ab1 in ["A", "C", "T", "G"]:
                                        for abc0 in ["A", "C", "T", "G"]:
                                            res = 1
                                            res = res * df_a[a0][a1]
                                            res = res * df_b[b1][b0]
                                            res = res * df_first[a1][b1][ab0]
                                            res = res * df_ab[ab0][ab1]
                                            res = res * df_second[ab1][c1][abc0]
                                            res = res * df_c[c1][c0]
                                            res = res * df_d[abc0][d0]
                                            acc += res
                    emissions[a0 + b0 + c0 + d0] = acc / 4

    return emissions




[docs]
def calc_emissions_double_JC69(
    a0_a1_t_vec,
    b0_b1_t_vec,
    c0_c1_t_vec,
    a1b1c1_abc0_t,
    d0_abc0_t_vec,
    a0_a1_mu_vec,
    b0_b1_mu_vec,
    c0_c1_mu_vec,
    a1b1c1_abc0_mu,
    d0_abc0_mu_vec,
):
    """
    This function returns the emission probabilities of a hidden
    state contining two coalescent events at the same time interval.

                    _________
                   |         |
         ---------abc0-----  |
               ____|___      |
            __|__      |     |
           |     |     |     |
         --a1----b1----c1--  |
           |     |     |     |
           a0    b0    c0    d0

    Parameters
    ----------
    a0_a1_t_vec, b0_b1_t_vec, c0_c1_t_vec, d0_abc0_t_vec : numeric list
        Each list contains the interval time for a site to mutate
        with a certain mutation rate, specified by *mu_vec
    a1b1c1_abc0_t : numeric
        Time interval for the coalescent events to happen.
    a0_a1_mu_vec, b0_b1_mu_vec, c0_c1_mu_vec, d0_abc0_mu_vec : numeric list
        Each list contains the mutation rates for each interval
        defined in *t_vec
    a1b1c1_abc0_mu : numeric
        Mutation rates for the interval where coalescents happen.
    """

    # a0 to a1
    Q_vec = [rate_mat_JC69(i) for i in a0_a1_mu_vec]
    df_a = p_b_given_a(t=a0_a1_t_vec, Q=Q_vec)
    df_a = b_given_a_to_dict_a_b(df_a)
    # df_a[a0][a1]

    # b1 to b0
    Q_vec = [rate_mat_JC69(i) for i in list(reversed(b0_b1_mu_vec))]
    df_b = p_b_given_a(t=list(reversed(b0_b1_t_vec)), Q=Q_vec)
    df_b = b_given_a_to_dict_a_b(df_b)
    # df_b[b1][b0]

    # c1 to c0
    Q_vec = [rate_mat_JC69(i) for i in list(reversed(c0_c1_mu_vec))]
    df_c = p_b_given_a(t=list(reversed(c0_c1_t_vec)), Q=Q_vec)
    df_c = b_given_a_to_dict_a_b(df_c)
    # df_c[c1][c0]

    # abc0 to d0
    Q_vec = [rate_mat_JC69(i) for i in list(reversed(d0_abc0_mu_vec))]
    df_d = p_b_given_a(t=list(reversed(d0_abc0_t_vec)), Q=Q_vec)
    df_d = b_given_a_to_dict_a_b(df_d)
    # df_d[abc0][d0]

    # Double coalescent
    df_double = p_b_c_d_given_a_JC69_analytical(t=a1b1c1_abc0_t, mu=a1b1c1_abc0_mu)
    df_double = b_c_d_given_a_to_dict_a_b_c_d(df_double)
    # df_double[a1][b1][c1][abc0]

    emissions = {}
    for a0 in ["A", "C", "T", "G"]:
        for b0 in ["A", "C", "T", "G"]:
            for c0 in ["A", "C", "T", "G"]:
                for d0 in ["A", "C", "T", "G"]:
                    acc = 0
                    for a1 in ["A", "C", "T", "G"]:
                        for b1 in ["A", "C", "T", "G"]:
                            for c1 in ["A", "C", "T", "G"]:
                                for abc0 in ["A", "C", "T", "G"]:
                                    res = 1
                                    res = res * df_a[a0][a1]
                                    res = res * df_b[b1][b0]
                                    res = res * df_c[c1][c0]
                                    res = res * df_double[a1][b1][c1][abc0]
                                    res = res * df_d[abc0][d0]
                                    acc += res
                    emissions[a0 + b0 + c0 + d0] = acc / 4
    return emissions




[docs]
def get_emission_prob_mat_introgression(
    t_A,
    t_B,
    t_AB,
    t_C,
    t_upper,
    t_out,
    t_m,
    rho_A,
    rho_B,
    rho_AB,
    rho_C,
    rho_ABC,
    coal_A,
    coal_B,
    coal_AB,
    coal_BC,
    coal_C,
    coal_ABC,
    n_int_AB,
    n_int_ABC,
    mu_A,
    mu_B,
    mu_C,
    mu_D,
    mu_AB,
    mu_ABC,
    cut_AB,
    cut_ABC,
):
    """
    This function returns the emission probabilities of all hidden states
    given a set of population genetics parameters. 
        
    #        |          |
    #        |  ABC  |\ \ 
    #        | AB |\ \ \ \ 
    #        / /\ \ \ \ \ \ 
    #       / /  \ \ \ \ \ \ 
    #       A      B   C   D
        
    Parameters
    ----------
    t_A : numeric
        Time between present time and the first speciation time for
        species A. 
    t_B : numeric
        Time between present time and the migration event for
        species A. 
    t_AB : numeric
        Time between speciation events.
    t_C : numeric
        Time between present time and the migration event for
        species C. 
    t_upper : numeric
        Time between the last ABC interval and the third speciation time.
    t_peak : numeric
        Mean divergence time between ABC and D after the third speciation time.
        It should be 4*coal_ABC (or it can be estimated instead). 
    rho_A, rho_B, rho_AB, rho_C, rho_ABC : numeric
        Recombination rates for the A, B, AB, C and ABC intervals.
    coal_A, coal_B, coal_AB, coal_C, coal_ABC : numeric
        Coalescent rates for the A, B, AB, C and ABC intervals.
    n_int_AB, n_int_ABC : integer
        Number of intervals in the AB and ABC parts of the tree. 
    mu_A, mu_B, mu_C, mu_D, mu_AB, mu_ABC : numeric
        Mutation rate for the A, B, C, D, AB and ABC intervals.
    """
    n_markov_states = (
        2 * n_int_AB * n_int_ABC + n_int_ABC * 3 + 3 * comb(n_int_ABC, 2, exact=True)
    )
    cut_BC = np.concatenate([[0], (cut_AB[1:] + t_m)])
    probs = np.empty((n_markov_states), dtype=object)
    states = np.empty((n_markov_states), dtype=object)

    # Deep coalescence, two single coalescents
    acc = 0
    for i in range(n_int_ABC):
        for j in range(i + 1, n_int_ABC):
            a0_a1_t_vec = [t_A, t_AB, cut_ABC[i]]
            a0_a1_mu_vec = [mu_A, mu_AB, mu_ABC]
            b0_b1_t_vec = [t_B + t_m, t_AB, cut_ABC[i]]
            b0_b1_mu_vec = [mu_B, mu_AB, mu_ABC]
            c0_c1_t_vec = [t_C + t_m + t_AB, cut_ABC[i]]
            c0_c1_mu_vec = [mu_C, mu_ABC]
            ab0_ab1_t_vec = [cut_ABC[j] - cut_ABC[i + 1]]
            ab0_ab1_mu_vec = [mu_ABC]
            a1b1_ab0_t = cut_ABC[i + 1] - cut_ABC[i]
            a1b1_ab0_mu = mu_ABC
            ab1c1_abc0_t = (
                (cut_ABC[j + 1] - cut_ABC[j]) if j != (n_int_ABC - 1) else t_upper
            )
            ab1c1_abc0_mu = mu_ABC
            add = (
                t_upper + cut_ABC[n_int_ABC - 1] - cut_ABC[j + 1]
                if j != (n_int_ABC - 1)
                else 0
            )
            # d0_abc0_t_vec = [t_A+t_AB+cut_ABC[n_int_ABC-1]+t_upper]+[t_peak+add]
            d0_abc0_t_vec = [t_out] + [add]
            # d0_abc0_mu_vec = [mu_D, mu_ABC]
            d0_abc0_mu_vec = [mu_D, mu_ABC]

            # V1 states
            emissions = calc_emissions_single_JC69(
                a0_a1_t_vec,
                b0_b1_t_vec,
                a1b1_ab0_t,
                ab0_ab1_t_vec,
                ab1c1_abc0_t,
                c0_c1_t_vec,
                d0_abc0_t_vec,
                a0_a1_mu_vec,
                b0_b1_mu_vec,
                a1b1_ab0_mu,
                ab0_ab1_mu_vec,
                ab1c1_abc0_mu,
                c0_c1_mu_vec,
                d0_abc0_mu_vec,
                coal_ABC,
                coal_ABC,
            )
            states[acc] = (1, i, j)
            probs[acc] = emissions
            acc += 1

            # V2 states
            emissions = calc_emissions_single_JC69(
                a0_a1_t_vec,
                c0_c1_t_vec,
                a1b1_ab0_t,
                ab0_ab1_t_vec,
                ab1c1_abc0_t,
                b0_b1_t_vec,
                d0_abc0_t_vec,
                a0_a1_mu_vec,
                c0_c1_mu_vec,
                a1b1_ab0_mu,
                ab0_ab1_mu_vec,
                ab1c1_abc0_mu,
                b0_b1_mu_vec,
                d0_abc0_mu_vec,
                coal_ABC,
                coal_ABC,
            )
            new_emissions = {}
            for k in list(emissions.keys()):
                new_emissions[k[0] + k[2] + k[1] + k[3]] = emissions[k]
            states[acc] = (2, i, j)
            probs[acc] = new_emissions
            acc += 1

            # V3 states
            emissions = calc_emissions_single_JC69(
                b0_b1_t_vec,
                c0_c1_t_vec,
                a1b1_ab0_t,
                ab0_ab1_t_vec,
                ab1c1_abc0_t,
                a0_a1_t_vec,
                d0_abc0_t_vec,
                b0_b1_mu_vec,
                c0_c1_mu_vec,
                a1b1_ab0_mu,
                ab0_ab1_mu_vec,
                ab1c1_abc0_mu,
                a0_a1_mu_vec,
                d0_abc0_mu_vec,
                coal_ABC,
                coal_ABC,
            )
            new_emissions = {}
            for k in list(emissions.keys()):
                new_emissions[k[2] + k[0] + k[1] + k[3]] = emissions[k]
            states[acc] = (3, i, j)
            probs[acc] = new_emissions
            acc += 1

    # Deep coalescence, one double coalescent
    for i in range(n_int_ABC):
        a0_a1_t_vec = [t_A, t_AB, cut_ABC[i]]
        a0_a1_mu_vec = [mu_A, mu_AB, mu_ABC]
        b0_b1_t_vec = [t_B + t_m, t_AB, cut_ABC[i]]
        b0_b1_mu_vec = [mu_B, mu_AB, mu_ABC]
        c0_c1_t_vec = [t_C + t_m + t_AB, cut_ABC[i]]
        c0_c1_mu_vec = [mu_C, mu_ABC]
        a1b1c1_abc0_t = (
            (cut_ABC[i + 1] - cut_ABC[i]) if i != (n_int_ABC - 1) else t_upper
        )
        a1b1c1_abc0_mu = mu_ABC
        add = (
            t_upper + cut_ABC[n_int_ABC - 1] - cut_ABC[i + 1]
            if i != (n_int_ABC - 1)
            else 0
        )
        # d0_abc0_t_vec = [t_A+t_AB+cut_ABC[n_int_ABC-1]+t_upper]+[t_peak+add]
        d0_abc0_t_vec = [t_out] + [add]
        # d0_abc0_mu_vec = [mu_D, mu_ABC]
        d0_abc0_mu_vec = [mu_D, mu_ABC]

        # V1 states
        emissions = calc_emissions_double_JC69(
            a0_a1_t_vec,
            b0_b1_t_vec,
            c0_c1_t_vec,
            a1b1c1_abc0_t,
            d0_abc0_t_vec,
            a0_a1_mu_vec,
            b0_b1_mu_vec,
            c0_c1_mu_vec,
            a1b1c1_abc0_mu,
            d0_abc0_mu_vec,
        )
        markov = (1, i, i)
        states[acc] = markov
        probs[acc] = emissions
        acc += 1

        # V2 states
        emissions = calc_emissions_double_JC69(
            a0_a1_t_vec,
            c0_c1_t_vec,
            b0_b1_t_vec,
            a1b1c1_abc0_t,
            d0_abc0_t_vec,
            a0_a1_mu_vec,
            c0_c1_mu_vec,
            b0_b1_mu_vec,
            a1b1c1_abc0_mu,
            d0_abc0_mu_vec,
        )
        new_emissions = {}
        for k in list(emissions.keys()):
            new_emissions[k[0] + k[2] + k[1] + k[3]] = emissions[k]
        markov = (2, i, i)
        states[acc] = markov
        probs[acc] = new_emissions
        acc += 1

        # V3 states
        emissions = calc_emissions_double_JC69(
            b0_b1_t_vec,
            c0_c1_t_vec,
            a0_a1_t_vec,
            a1b1c1_abc0_t,
            d0_abc0_t_vec,
            b0_b1_mu_vec,
            c0_c1_mu_vec,
            a0_a1_mu_vec,
            a1b1c1_abc0_mu,
            d0_abc0_mu_vec,
        )
        new_emissions = {}
        for k in list(emissions.keys()):
            new_emissions[k[2] + k[0] + k[1] + k[3]] = emissions[k]
        markov = (3, i, i)
        states[acc] = markov
        probs[acc] = new_emissions
        acc += 1

    # V0 states
    for i in range(n_int_AB):
        for j in range(n_int_ABC):
            a0_a1_t_vec = [t_A, cut_AB[i]]
            a0_a1_mu_vec = [mu_A, mu_AB]
            b0_b1_t_vec = [t_B + t_m, cut_AB[i]]
            b0_b1_mu_vec = [mu_B, mu_AB]
            c0_c1_t_vec = [t_C + t_m + t_AB, cut_ABC[j]]
            c0_c1_mu_vec = [mu_C, mu_ABC]
            ab0_ab1_t_vec = [t_AB - cut_AB[i + 1], cut_ABC[j]]
            ab0_ab1_mu_vec = [mu_AB, mu_ABC]
            a1b1_ab0_t = cut_AB[i + 1] - cut_AB[i]
            a1b1_ab0_mu = mu_AB
            ab1c1_abc0_t = (
                cut_ABC[j + 1] - cut_ABC[j] if j != (n_int_ABC - 1) else t_upper
            )
            ab1c1_abc0_mu = mu_ABC
            add = (
                t_upper + cut_ABC[n_int_ABC - 1] - cut_ABC[j + 1]
                if j != (n_int_ABC - 1)
                else 0
            )
            # d0_abc0_t_vec = [t_A+t_AB+cut_ABC[n_int_ABC-1]+t_upper]+[t_peak+add]
            d0_abc0_t_vec = [t_out] + [add]
            # d0_abc0_mu_vec = [mu_D, mu_ABC]
            d0_abc0_mu_vec = [mu_D, mu_ABC]

            emissions = calc_emissions_single_JC69(
                a0_a1_t_vec,
                b0_b1_t_vec,
                a1b1_ab0_t,
                ab0_ab1_t_vec,
                ab1c1_abc0_t,
                c0_c1_t_vec,
                d0_abc0_t_vec,
                a0_a1_mu_vec,
                b0_b1_mu_vec,
                a1b1_ab0_mu,
                ab0_ab1_mu_vec,
                ab1c1_abc0_mu,
                c0_c1_mu_vec,
                d0_abc0_mu_vec,
                coal_AB,
                coal_ABC,
            )
            states[acc] = (0, i, j)
            probs[acc] = emissions
            acc += 1

    # Introgression states
    for i in range(n_int_AB):
        for j in range(n_int_ABC):
            a0_a1_t_vec = [t_B, cut_BC[i]]
            a0_a1_mu_vec = [mu_B, mu_AB]
            b0_b1_t_vec = [t_C, cut_BC[i]]
            b0_b1_mu_vec = [mu_C, mu_AB]
            c0_c1_t_vec = [t_A, t_AB, cut_ABC[j]]
            c0_c1_mu_vec = [mu_A, mu_AB, mu_ABC]
            ab0_ab1_t_vec = [t_AB + t_m - cut_BC[i + 1], cut_ABC[j]]
            ab0_ab1_mu_vec = [mu_AB, mu_ABC]
            a1b1_ab0_t = cut_BC[i + 1] - cut_BC[i]
            a1b1_ab0_mu = mu_AB
            ab1c1_abc0_t = (
                cut_ABC[j + 1] - cut_ABC[j] if j != (n_int_ABC - 1) else t_upper
            )
            ab1c1_abc0_mu = mu_ABC
            add = (
                t_upper + cut_ABC[n_int_ABC - 1] - cut_ABC[j + 1]
                if j != (n_int_ABC - 1)
                else 0
            )
            # d0_abc0_t_vec = [t_A+t_AB+cut_ABC[n_int_ABC-1]+t_upper]+[t_peak+add]
            d0_abc0_t_vec = [t_out] + [add]
            # d0_abc0_mu_vec = [mu_D, mu_ABC]
            d0_abc0_mu_vec = [mu_D, mu_ABC]

            emissions = calc_emissions_single_JC69(
                a0_a1_t_vec,
                b0_b1_t_vec,
                a1b1_ab0_t,
                ab0_ab1_t_vec,
                ab1c1_abc0_t,
                c0_c1_t_vec,
                d0_abc0_t_vec,
                a0_a1_mu_vec,
                b0_b1_mu_vec,
                a1b1_ab0_mu,
                ab0_ab1_mu_vec,
                ab1c1_abc0_mu,
                c0_c1_mu_vec,
                d0_abc0_mu_vec,
                coal_BC,
                coal_ABC,
            )

            new_emissions = {}
            for k in list(emissions.keys()):
                new_emissions[k[2] + k[0] + k[1] + k[3]] = emissions[k]
            markov = (4, i, j)
            states[acc] = markov
            probs[acc] = new_emissions
            acc += 1

    probs = pd.DataFrame(list(probs))
    probs.insert(0, "hidden_state", states)

    return probs