Source code for bioa.algorithms.reconstruction

# -*- coding: utf-8 -*-
"""
================================================
Algorithms for Viral Quasispecies Reconstruction
================================================
"""

__all__ = ["frequency_em",
           "max_bandwidth_strategy",
           "random_bandwidth_strategy",
           "max_frequency_strategy",
           "greedy_fork_resolution",
           "random_fork_resolution",
           "min_forest_fork_resolution",
           "least_squares_fork_balance",
           "min_unsplittable_flows_resolution",
           "amplicon_frequency_matrix_strategy",
           "amplicon_frequency_matrix_strategy_from_graph",
           "update_freq_matrix_from_lp_output"]


__author__ = """Nicholas Mancuso (nick.mancuso@gmail.com)"""
#   Copyright (C) 2011-2012 by
#   Nicholas Mancuso <nick.mancuso@gmail.com>
#   Bassam Tork <basamt@gmail.com>
#   All rights reserved.
#   BSD license.

import math
import collections as coll
import itertools as itools
import logging
import random as rdm
import sys
import bioa
import numpy
import networkx as netx


def frequency_em(graph, quasispecies, epsilon=0.001):
    """ Estimate the quasispecies' frequencies via Expectation Maximization.

    Parameters
    ----------
    graph : ReadGraph
        The read graph used to reconstruct the viral population.

    quasispecies : dict
        The reconstructed quasispecies.

    epsilon : float (optional, default=0.001)
        Threshold for convergence.

    Returns
    -------
    quasispecies : dict
        The reconstructed quasispecies with new frequencies.
    """

    em_graph = netx.Graph()

    # helper functions for frequency, probability, and counts
    def _get_qfreq(qsps):
        return em_graph.node[qsps]["freq"]

    def _set_qfreq(qsps, freq):
        em_graph.node[qsps]["freq"] = freq

    def _get_rfreq(read):
        return em_graph.node[read]["count"]

    def _get_p(qsps, r):
        return em_graph[qsps][r]["p"]

    def _set_p(qsps, r, p):
        em_graph[qsps][r]["p"] = p

    reads = graph.get_read_nodes()
    total_freq = 0.0
    for read in reads:
        freq = graph.get_node_freq(read)
        total_freq += freq
        em_graph.add_node(read, count=freq, color=1)

    init_f = 1.0 / len(quasispecies)
    for qsps in quasispecies:
        em_graph.add_node(qsps, color=0, freq=init_f)
        for read in quasispecies[qsps]:
            if read in [graph.sink, graph.source] \
                    or read not in reads:
                continue
            em_graph.add_edge(qsps, read)

    freqs = {qsps: _get_qfreq(qsps) for qsps in quasispecies}
    while True:
        # E step
        for qsps in quasispecies:
            f_qsps = _get_qfreq(qsps)
            for read in em_graph.neighbors(qsps):
                total_f = sum(_get_qfreq(nbr) for nbr in em_graph.neighbors(read))
                _set_p(qsps, read, f_qsps / total_f)

        # M step
        for qsps in quasispecies:
            total_p = sum(_get_p(qsps, r) * _get_rfreq(r) for r in em_graph.neighbors(qsps))
            qfreq = total_p / sum(_get_rfreq(r) for r in em_graph.neighbors(qsps))
            _set_qfreq(qsps, qfreq)

        delta_f = sum((_get_qfreq(qsps) - freqs[qsps]) ** 2.0 for qsps in quasispecies)
        if delta_f < epsilon:
            break

        freqs = {qsps: _get_qfreq(qsps) for qsps in quasispecies}

    total_f = sum(freq for freq in freqs.values())
    return {qsps: freq / total_f for qsps, freq in freqs.items()}


[docs]def amplicon_frequency_matrix_strategy_from_graph(graph, use_chi_sqrd=False):
    """ Reconstructs quasispecies from the supplied graph.

    An amplicon strategy is used to pair reads with one another using a most
    consistent function.

    Parameters
    ----------
    graph : DecliquedAmpliconReadGraph
        Amplicon read graph with fork vertices added.

    use_chi_sqrd : bool (optional, default=False)
        Whether to use the chi squared test in selecting a guide or the
        uniform distribution.

    Returns
    -------
    quasispecies : dict
        Quasispecies spectrum with sequences as keys and frequency as value.
    """
    if not graph:
        raise ValueError("Proper read graph expected!")

    freq_matrix = bioa.FrequencyMatrix(graph.get_amplicons(), True)
    se_positions = graph.get_start_end_positions()
    return _guide_distribution_method(freq_matrix, se_positions, use_chi_sqrd)


[docs]def amplicon_frequency_matrix_strategy(amplicons, use_chi_sqrd=False, norm=False):
    """ Reconstructs quasispecies from the supplied read matrix.

    An amplicon strategy is used to pair reads with one another using a most
    consistent function.

    Parameters
    ----------
    amplicons : Amplicons
       Amplicons data-structure where each amplicon is a list of reads. This will
       be used to construct the frequency matrix.

    use_chi_sqrd : bool (optional, default=False)
        Whether to use the chi squared test in selecting a guide or the uniform
        distribution.

    norm : bool (optional, default=False)
        Whether or not to normalize read counts to frequencies.

    Returns
    -------
    quasispecies : dict
        Quasispecies spectrum with sequences as keys and frequency as value.
    """

    if norm:
        freq_matrix = bioa.FrequencyMatrix.construct_normalized_freq_matrix(amplicons, True)
    else:
        freq_matrix = bioa.FrequencyMatrix(amplicons, True)

    return _guide_distribution_method(freq_matrix, amplicons.se_positions, use_chi_sqrd)


[docs]def max_bandwidth_strategy(graph, epsilon=1e-3, factor=1.0, until_covered=False, use_em=False):
    """ Finds quasispecies over an amplicon-based read graph by repeatedly
    finding the max bandwidth path until there is no possibly path from the
    source to the sink.

    Parameters
    ----------
    graph : DecliquedAmpliconReadGraph
        Amplicon read graph with fork vertices added.

    freq_name : str (optional, default="count")
        Name of the frequency or count property to use.

    epsilon : float (optional, default=1e-3)
        Threshold for edge values to consider.

    factor : float (optional, default=1.0)
        This is a scaling factor to reduce edge weights by. By default
        it will not change the amount.

    until_covered : bool (optional, default=False)
        Setting 'until_covered' to True will only output the first set
        of quasispecies to cover all edges. Setting to False will output
        the entire set.

    use_em : bool (optional, default=False)
        Determines whether or not to use naive frequency assignment to the
        Quasispecies or to use the Expectation Maximization method.

    Returns
    -------
    quasispecies : dict
        Quasispecies spectrum with sequences as keys and frequency as value.
    """
    if not graph:
        raise ValueError("Non-empty Read-Graph expected!")

    def _max_band(graph):
        """ Find maximum bandwidth s-t path
        """
        source = graph.source
        return bioa.max_bandwidth(graph.graph, source, "count", epsilon)

    return _general_path_resolution(graph, _max_band, factor, until_covered, use_em)


[docs]def random_bandwidth_strategy(graph, freq_name="count", epsilon=1e-3, factor=1.0):
    """ Finds quasispecies over an amplicon-based read graph by repeatedly
    finding a random max-bandwidth path until there is no possible path
    from the source to the sink.

    Parameters
    ----------
    graph : DecliquedAmpliconReadGraph
        Amplicon read graph with fork vertices added.

    freq_name : str (optional, default="count")
        Name of the frequency or count property to use.

    epsilon : float (optional, default=1e-3)
        Threshold for edge values to consider.

    factor : float (optional, default=1.0)
        This is a scaling factor to reduce edge weights by. By default it will
        not change the amount.

    Returns
    -------
    quasispecies : dict
        Quasispecies spectrum with sequences as keys and frequency as value.
    """
    if not graph:
        raise ValueError("Non-empty Read-Graph expected!")

    def _rand_path(graph):
        """ Find a random s-t path in the graph such that each edge has at
            least epsilon count over it.
        """
        source = graph.source
        sink = graph.sink
        bwidth = {sink: sys.maxint}
        path = {source: [source]}

        node = source
        while node != sink:

            # filter only valid successors
            choices = [succ for succ in graph.successors(node) \
                        if graph.get_edge_freq(node, succ) > epsilon]

            # return if we are stuck
            if not choices:
                return {}, {}

            # randomly choose a successor
            next_node = rdm.choice(choices)
            count = graph.get_edge_freq(node, next_node)

            # if count is less, set as max bandwidth
            if count < bwidth[sink]:
                bwidth[sink] = count

            # path to next node is the path to current node, plus itself
            path[next_node] = path[node] + [next_node]
            node = next_node

        return bwidth, path

    return _general_path_resolution(graph, _rand_path, factor)


[docs]def max_frequency_strategy(graph, epsilon=1e-3, factor=1.0, until_covered=False):
    """ Finds quasispecies over an amplicon-based read graph by repeatedly
    finding a maximum andwidth path until there is no possible path
    from the source to the sink.

    Parameters
    ----------
    graph : DecliquedAmpliconReadGraph
        Amplicon read graph with fork vertices added.

    freq_name : str (optional, default="count")
        Name of the frequency or count property to use.

    epsilon : float (optional, default=1e-3)
        Threshold for edge values to consider.

    factor : float (optional, default=1.0)
        This is a scaling factor to reduce edge weights by. By default
        it will not change the amount.

    until_covered : bool (optional, default=False)
        Setting 'until_covered' to True will only output the first set
        of quasispecies to cover all edges. Setting to False will output
        the entire set.

    Returns
    -------
    quasispecies : dict
        Quasispecies spectrum with sequences as keys and frequency as value.
    """
    if not graph:
        raise ValueError("Non-empty Read-Graph expected!")

    def _max_freq_path(graph):
        """ Find a max-frequency s-t path in the graph such that each edge has
            at least epsilon count over it.
        """
        sink = graph.sink
        source = graph.source
        path = {}
        dist = {source: 0}
        final = {sink: [sink]}

        # since our graph is acyclic and directed (DAG) we can just use
        # topological sort + dynamic programming to solve in linear time.
        for node in netx.topological_sort(graph.graph):
            # if we haven't gotten to this node by now, we can't use it.
            if node not in dist:
                continue

            for next_node in graph.successors(node):
                # if cost to this edge is less than epsilon we can't use it
                ecost = graph.get_edge_freq(node, next_node)
                if ecost < epsilon:
                    continue
                # if the current cost to next_node is less than the cost using
                # (node, next_node) edge replace it
                cost = dist[node] + ecost
                if dist.get(next_node, epsilon) < cost:
                    dist[next_node] = cost
                    path[next_node] = node

        # if the max isn't the sink node, we didnt find a valid path
        if max(dist, key=lambda x: dist[x]) != sink:
            return {}, {}

        node = sink
        while node != source:
            next_node = path[node]
            final[sink].append(next_node)
            node = next_node
        final[sink].reverse()

        return dist, final

    return _general_path_resolution(graph, _max_freq_path, factor, until_covered)


[docs]def greedy_fork_resolution(graph, perfect_match=True):
    """ Greedily resolve forks in the read graph.

    Parameters
    ----------
    graph : DecliquedAmpliconReadGraph
        Amplicon read graph with fork vertices added.

    perfect_match : bool (optional, default=True)
        Whether to match values that are equal before considering
        maximums.

    Returns
    -------
    graph : DecliquedAmpliconReadGraph
        Modified read graph with all forks resolved.
    """
    if not graph:
        raise ValueError("Non-empty Read-Graph expected!")

    epsilon = 0.001

    def _find_match(pq, value):
        match = None
        for read, count in pq:
            if math.fabs(value - count) <= epsilon:
                match = read
                break
        if match:
            pq.remove_node(match)

        return match

    def _find_perfect_matches(lpq, rpq, lmax, lread, rmax, rread):
        match_tuple = None
        if lmax >= rmax:
            if math.fabs(lmax - rmax) < epsilon:
                match_tuple = (lread, rread, lmax)
            else:
                match = _find_match(rpq, lmax)
                if match:
                    match_tuple = (lread, match, lmax)
                    rpq.add_node(rmax, rread)
        else:
            if math.fabs(lmax - rmax) < epsilon:
                match_tuple = (lread, rread, rmax)
            else:
                match = _find_match(lpq, rmax)
                if match:
                    match_tuple = (match, rread, rmax)
                    lpq.add_node(lmax, lread)

        return match_tuple

    def _greedy_resolve(lreads, rreads):
        # build max priority queues to get the max count read from
        # both the left reads and the right reads of the biclique
        resolution = []
        lpq = bioa.PQueue(lreads, behave="max")
        rpq = bioa.PQueue(rreads, behave="max")
        # greedily pair up reads, reducing their counts
        while lpq and rpq:
            lread, lmax = lpq.get_max()
            rread, rmax = rpq.get_max()

            if perfect_match:
                match = _find_perfect_matches(lpq, rpq, lmax, \
                            lread, rmax, rread)
                # we found a perfect matchup. continue
                if match:
                    resolution.append(match)
                    continue

            minmax = min([lmax, rmax])
            lmax -= minmax
            rmax -= minmax

            # Considering that the _sum_ of both sides is equal,
            # one, or both, may be exhausted of counts.
            if lmax > 0:
                lpq.add_node(lmax, lread)
            elif rmax > 0:
                rpq.add_node(rmax, rread)

            resolution.append((lread, rread, minmax))

        return resolution

    return _general_fork_resolution(graph, _greedy_resolve)


[docs]def random_fork_resolution(graph):
    """ Resolve forks by random matching fork vertices.

    Parameters
    ----------
    graph : DecliquedAmpliconReadGraph
        Amplicon read graph with fork vertices added.

    Returns
    -------
    graph : DecliquedAmpliconReadGraph
        Modified read graph with all forks resolved.
    """
    if not graph:
        raise ValueError("Non-empty Read-Graph expected!")

    def _random_resolve(lreads, rreads):
        # build a set for left and right sides of the fork then randomly select
        # from each set and make an edge until counts are exhausted
        resolution = []
        left = set(lreads)
        right = set(rreads)
        while left and right:
            # randomly pair up reads, reducing their counts
            # the conversion to list is temporary until a better solution
            # is realized.
            lfreq, lread = rdm.choice(list(left))
            rfreq, rread = rdm.choice(list(right))

            left.remove((lfreq, lread))
            right.remove((rfreq, rread))

            minfreq = min([lfreq, rfreq])
            lfreq -= minfreq
            rfreq -= minfreq

            # Considering that the _sum_ of both sides is equal,
            # one, or both, may be exhausted of counts.
            if lfreq > 0:
                left.add((lfreq, lread))
            elif rfreq > 0:
                right.add((rfreq, rread))

            resolution.append((lread, rread, minfreq))

        return resolution

    return _general_fork_resolution(graph, _random_resolve)


[docs]def min_forest_fork_resolution(graph):
    """
    """
    try:
        import cplex
    except ImportError:
        raise ImportError(
               "min_forest_fork_resolution() requires cplex: http://cplex.com")

    if not graph:
        raise ValueError("Connected Read Graph required!")

    def _min_forest(lreads, rreads):
        resolution = []

        # w.l.o.g assume |L| <= |R|
        left_smaller = len(lreads) <= len(rreads)
        if left_smaller:
            lfreqset, lreadset = zip(*lreads)
            rfreqset, rreadset = zip(*rreads)
            lset = {lread: lfreq for lfreq, lread in lreads}
            rset = {rread: rfreq for rfreq, rread in rreads}
        else:
            lfreqset, lreadset = zip(*rreads)
            rfreqset, rreadset = zip(*lreads)
            rset = {lread: lfreq for lfreq, lread in lreads}
            lset = {rread: rfreq for rfreq, rread in rreads}

        lreadset = list(lreadset)
        rreadset = list(rreadset)

        model = cplex.Cplex()
        model.objective.set_sense(model.objective.sense.minimize)
        model.set_problem_type(model.problem_type.LP)
        # suppress outputs
        model.set_results_stream(None)
        model.set_warning_stream(None)
        model.set_error_stream(None)
        model.set_log_stream(None)

        # variables are over edges and over balance of a root
        edge_map = {str((lread, rread)): (lread, rread) for lread in lreadset \
                        for rread in rreadset}
        edge_variables = edge_map.keys()
        # add the objective (minimize residuals)
        var_types = [model.variables.type.binary] * len(edge_variables)
        model.variables.add(names=edge_variables, types=var_types)

        # add the absolute value of the total balance of a tree variables
        lower_bs = [0.0] * len(lreadset)
        var_types = [model.variables.type.continuous] * len(lreadset)
        model.variables.add(names=lreadset, lb=lower_bs, types=var_types)

        # objective is to minimize the sum of the absolute value balances
        model.objective.set_linear([(lread, 1.0) for lread in lreadset])
        model.objective.set_linear([(edge, 0.0) for edge in edge_variables])

        # add the constraint to only allow larger-side reads a single parent
        # \sum_i^L x_{ij} = 1 \forall j \in R
        for rread in rreadset:
            edge_vars = [str((lread, rread)) for lread in lreadset]
            coefficients = [1.0] * len(lreadset)
            lhs = cplex.SparsePair(edge_vars, coefficients)
            model.linear_constraints.add(lin_expr=[lhs], senses=["L"], \
                                         rhs=[2.0])

        # add the constraints to find absolute value of the balance of a root
        # \sum_j^R r_j * x_{ij} - l_i <= y_i \forall i \in L
        # l_i - \sum_j^R r_j x_{ij} <= y_i \forall i \in L
        for lread in lreadset:
            lfreq = lset[lread]
            # first way
            edge_vars = [str((lread, rread)) for rread in rreadset]
            coefficients = [rfreq for rfreq in rfreqset]
            lhs = cplex.SparsePair(edge_vars + [lread], coefficients + [-1.0])
            model.linear_constraints.add(lin_expr=[lhs], senses=["L"], \
                                         rhs=[lfreq])
            # then the other
            coefficients = [-rfreq for rfreq in rfreqset]
            lhs = cplex.SparsePair(edge_vars + [lread], coefficients + [-1.0])
            model.linear_constraints.add(lin_expr=[lhs], senses=["L"], \
                                         rhs=[-lfreq])

        model.solve()
        # the model will solve which edges to choose, but we must still
        # calculate the frequency to place over each edge. While a tree
        # solution will be unique, if balancing is not done to graph beforehand
        # we may have # negative assignments. Therefore, we will assign as a
        # ratio.
        for edge, selected in zip(edge_variables, model.solution.get_values()):
            if not selected:
                continue
            left, right = edge_map[edge]
            lfreq = lset[left]
            rfreq = rset[right]
            freq = rfreq / lfreq
            # the resolution needs to be in actual left-right order
            if left_smaller:
                resolution.append((left, right, freq))
            else:
                resolution.append((right, left, freq))

        return resolution

    return _general_fork_resolution(graph, _min_forest)


[docs]def min_unsplittable_flows_resolution(graph, flow_num, nthreads=0, timeout=300,
        threshold=1e-5):
    """ Min-k unsplittable flows method.
    This tries to find k paths in the graph that cover the reads with minimum
    amount of flow.

    Parameters
    ----------
    graph : DecliquedAmpliconReadGraph
        Amplicon read graph with fork vertices added.

    flow_num : int
        The number of paths to try.

    nthreads : int, optional default=0
        The number of threads to use.
        0 = Use maximum number of threads (i.e. CPUs)
        1 = Single Threaded
        n = Use n threads

    timeout : int, optional default=300
        The number of seconds to run cplex until timeout.

    threshold : float, optional default=1e-5
        The threshold for determining a variable being selected or not.

    Returns
    -------
    quasispecies : dict
        Quasispecies spectrum with sequences as keys and frequency as value.

    Notes
    -----
    Requires CPLEX and CPLEX Python module to be installed. http://www.cplex.com
    """
    try:
        import cplex
    except ImportError:
        raise ImportError(
               "min_unsplittable_flows_resolution() requires cplex: http://cplex.com")

    if not graph:
        raise ValueError("Connected Read Graph required!")
    if flow_num < 1:
        raise ValueError("Flow must be positive integer!")
    if nthreads < 0:
        raise ValueError("Number of threads must be at least 0" +\
                " (for default behavior)")
    if timeout < 1:
        raise ValueError("Timeout must be positive value!")


    # keep caches of variables for faster constraint building
    # helper functions to access constraints
    pred_cache = {}
    succ_cache = {}
    def pred_node_vars(flow, node, name_func):
        key = name_func((flow, node))
        if key not in pred_cache:
            edges = itools.product([flow], \
                        itools.product(graph.graph.predecessors(node), [node]))
            pred_cache[key] = map(name_func, edges)
        return pred_cache[key]

    def succ_node_vars(flow, node, name_func):
        key = name_func((flow, node))
        if key not in succ_cache:
            edges = itools.product([flow], \
                    itools.product([node], graph.graph.successors(node)))
            succ_cache[key] = map(name_func, edges)
        return succ_cache[key]

    logging.debug("Building MCF model")
    model = cplex.Cplex()
    model.set_problem_type(model.problem_type.MILP)
    # suppress outputs
    model.set_results_stream(None)
    model.set_warning_stream(None)
    model.set_error_stream(None)
    # set parameters
    model.parameters.threads.set(nthreads)
    model.parameters.timelimit.set(timeout)

    # add t-s edge to make into circulation problem
    source, sink = graph.source, graph.sink
    graph.add_edge(sink, source, 0)

    # just re-use this list
    edges = graph.graph.edges()

    # add the binary flow variables
    flow_range = range(flow_num)
    bin_var_name = str
    bvariables = map(bin_var_name, itools.product(flow_range, edges))
    var_types = [model.variables.type.binary] * len(bvariables)
    model.variables.add(names=bvariables, types=var_types)

    # add the real-valued flow variables
    frac_var_name = lambda x: "{0}*".format(x)
    variables = map(frac_var_name, itools.product(flow_range, edges))
    var_types = [model.variables.type.continuous] * len(variables)
    lbounds = [0.0] * len(variables)
    ubounds = [1.0] * len(variables)
    model.variables.add(names=variables, lb=lbounds, ub=ubounds, types=var_types)

    # objective is to minimize the sum of the flows
    obj_edge = itools.product(flow_range, [(sink, source)])
    #obj_bvars = itools.imap(lambda x: (bin_var_name(x), 0.0), obj_edge)
    obj_vars = itools.imap(lambda x: (frac_var_name(x), 1.0), obj_edge)
    model.objective.set_linear(list(itools.chain(obj_vars)))
    model.objective.set_sense(model.objective.sense.minimize)

    # constraint 1 is to cover all edges in decliqued graph
    for node, next_node in edges:
        e_variables = map(frac_var_name, itools.product(flow_range, \
                        [(node, next_node)]))
        coefficients = [1.0] * len(e_variables)
        lhs = cplex.SparsePair(e_variables, coefficients)
        freq = graph.get_edge_freq(node, next_node)
        model.linear_constraints.add(lin_expr=[lhs], senses=["G"], rhs=[freq])

    # constraint 2 is to make sure that flows conserved
    for flow, node in itools.product(flow_range, graph):
        in_variables = pred_node_vars(flow, node, bin_var_name)
        out_variables = succ_node_vars(flow, node, bin_var_name)
        in_coefficients = [1.0] * len(in_variables)
        out_coefficients = [-1.0] * len(out_variables)
        lhs = cplex.SparsePair(in_variables + out_variables, \
                in_coefficients + out_coefficients)
        model.linear_constraints.add(lin_expr=[lhs], senses=["E"], rhs=[0.0])

    for flow, node in itools.product(flow_range, graph):
        in_variables = pred_node_vars(flow, node, frac_var_name)
        out_variables = succ_node_vars(flow, node, frac_var_name)
        in_coefficients = [1.0] * len(in_variables)
        out_coefficients = [-1.0] * len(out_variables)
        lhs = cplex.SparsePair(in_variables + out_variables, \
                in_coefficients + out_coefficients)
        model.linear_constraints.add(lin_expr=[lhs], senses=["E"], rhs=[0.0])

    # constraint 3 is to select binary flows
    for flow, edge in itools.product(flow_range, edges):
        bin_var = bin_var_name((flow, edge))
        var = frac_var_name((flow, edge))
        lhs = cplex.SparsePair([bin_var, var], [1.0, -1.0])
        model.linear_constraints.add(lin_expr=[lhs], senses=["G"], rhs=[0.0])

    # constraint 4 is to make flows unsplittable
    for flow, node in itools.product(flow_range, graph):
        out_variables = succ_node_vars(flow, node, bin_var_name)
        out_coefficients = [1.0] * len(out_variables)
        lhs = cplex.SparsePair(out_variables, out_coefficients)
        model.linear_constraints.add(lin_expr=[lhs], senses=["L"], rhs=[1.0])

    con_num = model.linear_constraints.get_num()
    bin_num = model.variables.get_num_binary()
    frac_num = model.variables.get_num_semicontinuous()
    log_str = "MCF model has {} constraints".format(con_num)
    log_str += ", {} binary variables".format(bin_num)
    log_str += ", and {} semicontinuous variables".format(frac_num)
    logging.debug(log_str)

    model.solve()
    #values = dict(zip(bvariables, model.solution.get_values(bvariables)))
    fvalues = dict(zip(variables, model.solution.get_values(variables)))
    flow_paths = coll.defaultdict(list)
    flow_vals = coll.defaultdict(float)
    tflow = 0.0

    # remove dummy edge
    graph.remove_edge(sink, source)
    bfs_edges = list(netx.bfs_edges(graph.graph, graph.source))
    for flow, edge in itools.product(flow_range, bfs_edges):
        # if we've selected this edge for the kth flow AND it has actual flow value
        if fvalues[frac_var_name((flow, edge))] > threshold:
            u, v = edge
            if u == graph.source:
                tflow += fvalues[frac_var_name((flow, edge))]
                flow_vals[flow] += fvalues[frac_var_name((flow, edge))]
            flow_paths[flow].append(graph.get_node_read(v))

    final = dict([])
    for i, flow in enumerate(flow_vals):
        if flow_vals[flow] == 0.0:
            continue
        qs = bioa.reconstruct_sequence(flow_paths[flow], \
                graph.get_start_end_positions())
        final[qs] = flow_vals[flow] / tflow
        log_str = "Variant {0} has length {1} and frequency {2}"
        log_str = log_str.format(i, len(qs), final[qs])
        logging.debug(log_str)

    logging.debug("MCF found {} total variants".format(len(final)))
    return final


[docs]def least_squares_fork_balance(graph, weighted=False):
    """ Balance all forks in the graph by a least squares approach.

    Parameters
    ----------
    graph : DecliquedAmpliconReadGraph
        Amplicon read graph with fork vertices added.

    weighted : bool (optional, default=False)
        Whether or not to calculate weighted least squares or
        ordinary least squares.

    Returns
    -------
    balanced_graph : DecliquedAmpliconReadGraph
        Amplicon read graph with all forks balanced.

    Notes
    -----
    This function requires cplex to be installed in order to work.
    http://cplex.com
    """
    try:
        import cplex
    except ImportError:
        raise ImportError(
                "least_sqares_fork_balance() requires cplex: http://cplex.com")

    if not graph:
        raise ValueError("Connected Read Graph required!")

    model = cplex.Cplex()
    model.objective.set_sense(model.objective.sense.minimize)
    model.set_problem_type(model.problem_type.QP)
    # suppress outputs
    model.set_results_stream(None)
    model.set_warning_stream(None)
    model.set_error_stream(None)
    model.set_log_stream(None)

    source = graph.source
    sink = graph.sink
    reads, forks = graph.get_read_and_fork_nodes()
    forks = forks - set([source, sink])

    # variables are over reads
    variables = map(str, reads)

    # add the objective (minimize residuals)
    lower_bs = [-graph.get_node_freq(read) for read in reads]
    upper_bs = [cplex.infinity] * len(reads)
    model.variables.add(names=variables, lb=lower_bs, ub=upper_bs)

    if not weighted:
        coefficients = [(var, var, 1.0) for var in variables]
    else:
        scale_coeffics = [1.0 / graph.get_node_freq(read) for read in reads]
        coefficients = [(var, var, coef) for var, coef in \
                            zip(variables, scale_coeffics)]

    model.objective.set_quadratic_coefficients(coefficients)

    # add balance constraint
    for fork in forks:
        preds = graph.predecessors(fork)
        succs = graph.successors(fork)

        b_vars = []
        coefficients = ([1.0] * len(preds)) + ([-1.0] * len(succs))
        tally = 0.0
        for pred in preds:
            freq = graph.get_node_freq(pred)
            b_vars.append(str(pred))
            tally -= freq

        for succ in succs:
            freq = graph.get_node_freq(succ)
            b_vars.append(str(succ))
            tally += freq

        lhs = cplex.SparsePair(b_vars, coefficients)
        model.linear_constraints.add(lin_expr=[lhs], senses=["E"], rhs=[tally])

    # readjust graph
    model.solve()
    for read, offset in zip(variables, model.solution.get_values()):
        freq = graph.get_node_freq(read)
        graph.set_node_freq(read, freq + offset)

    return graph


[docs]def update_freq_matrix_from_lp_output(freq_matrix, filename, rounding=False):
    """ Updates the frequency matrix from the output of a balancing scheme.

    Parameters
    ----------
    read_matrix : Numpy matrix
        Numpy matrix or list of lists of actual reads for each amplicon.
        This will be used to construct the frequency matrix.

    filename : str
        Path to the file that contains the least squares output from a
        solver.

    rounding : bool (optional, default=False)
        Whether or not to round the results for each read frequency.

    Returns
    -------
    updated_frequency_matrix : Numpy matrix
        The frequency matrix with updated counts.

    Notes
    -----
    This should be deprecated in order to use to use a newer direct solver
    based approach.
    """
    def _parse_coord(strcoord):
        inner = strcoord[1:-1]
        split = inner.split(",")
        assert len(split) == 2, "Invalid coordinate!"
        return int(split[0]), int(split[1])

    with open(filename, "r") as file:
        for line in file:
            items = line.split("  ")
            length = len(items) - 1
            for idx in range(length):
                i, c = _parse_coord(items[idx])
                offset = float(items[idx + 1])
                freq_matrix[i][c][1] += offset

    if rounding:
        pass

    return freq_matrix


# Lets keep the internal functions here
def _general_path_resolution(graph, path_finder, factor=1.0, until_covered=False, use_em=False):
    """ This is our general algorithm to resolve paths.

    It will call the resolver function to get the pair: bandwidths and paths.
    The count of all edges in the source-sink path will be reduced by the
    bandwidth returned by the resolution function. This continues until no
    non-zero paths are left.

    Parameters
    ----------
    graph : DecliquedAmpliconReadGraph
        Amplicon read graph with fork vertices added.

    path_finder : function
        Function that takes a graph. It should find a source-sink path in
        the graph. It returns a pair of dictionaries. The first is the
        bandwidth dictionary. It should accept at least the sink node as a
        key, returning the bandwidth of the source-sink path. The second is
        the path dictionary. It should at least accept the sink node as a
        key, return a list of nodes indicating the source-sink path.

    factor : float (optional, default=1.0)
        This is a scaling factor to reduce edge weights by. By default
        it will not change the amount.

    until_covered : bool (optional, default=False)
        Setting 'until_covered' to True will only output the first set
        of quasispecies to cover all edges. Setting to False will output
        the entire set.

    use_em : bool (optional, default=False)
        Determines whether or not to use naive frequency assignment to the
        Quasispecies or to use the Expectation Maximization method.

    Returns
    -------
    quasispecies : dict
        Quasispecies spectrum with sequences as keys and frequency
        as value.
    """
    quasispecies = coll.defaultdict(float)
    path_list = []
    qsps_path = {}
    final = {}
    total = 0.0
    sink = graph.sink
    se_positions = graph.get_start_end_positions()

    func_name = path_finder.__name__
    log_str = "Running generalized path-strategy with {0} as path-finder"
    log_str = log_str.format(func_name)
    logging.info(log_str)
    # We repeatedly get an s-t path and use that as a quasispecies until there
    # are no valid paths from the source to the sink
    while True:
        costs, paths = path_finder(graph)

        # there must exist a path from source -> sink to continue
        if sink not in costs:
            break

        cost = costs[sink]
        path = paths[sink]

        reads = (graph.get_node_read(node) for node in path)

        # join the reads and add to quasispecies list
        qs = bioa.reconstruct_sequence(reads, se_positions)
        quasispecies[qs] += cost

        total += cost

        # reduce the frequency of the path after our selection
        edge_set = set([])
        for idx, v in enumerate(path[:-1]):
            nextn = path[idx + 1]
            edge_set.add((v, nextn))
            e_freq = graph.get_edge_freq(v, nextn)
            graph.set_edge_freq(v, nextn, e_freq - (cost * factor))

        qsps_path[qs] = path
        path_list.append((cost, edge_set, qs))

    # the user may only want to output the first subset of quasispecies
    # that cover all edges rather than the entire set.
    if until_covered:
        total = 0.0
        nquasispecies = coll.defaultdict(float)
        edges = set(graph.graph.edges())
        path_list.sort(key=lambda trip: trip[0])
        while edges and path_list:
            path_trip = path_list.pop()
            edges = edges - path_trip[1]
            qs = path_trip[2]
            nquasispecies[qs] += path_trip[0]
            total += path_trip[0]

        # remove the qsps we didnt use from the path dict for EM
        for path_trip in path_list:
            # if factor < 1 there may be duplicates
            if path_trip[2] in qsps_path:
                del qsps_path[path_trip[2]]

        quasispecies = nquasispecies

    # normalize the bandwidths to be frequencies
    logging.debug("{0} found {1} variants".format(func_name, len(quasispecies)))
    if use_em:
        final = frequency_em(graph, qsps_path)
    else:
        i = 0
        for qs in quasispecies:
            freq = quasispecies[qs]
            final[qs] = freq / total
            log_str = "Variant {0} has length {1} and frequency {2}"
            log_str = log_str.format(i, len(qs), final[qs])
            logging.debug(log_str)
            i += 1

    return final


def _general_fork_resolution(graph, resolver):
    """ This is our general algorithm to resolve forks. It will continually
        call the resolver function on each fork until no forks exist in the
        graph.

    Parameters
    ----------
    graph : DecliquedAmpliconReadGraph
        Amplicon read graph with fork vertices added.

    resolver : function
        Function that takes a fork predecessor read/freq pairs,
        and successor read/freq pairs. It should return the resolution in
        list form as (lread, rread, freq) tuples.

    Returns
    -------
    graph : DecliquedAmpliconReadGraph
        Modified read graph with all forks resolved.
    """
    def _get_read_pairs(graph, func):
        """ This will return a list of (freq, read) pairs depending on the
            function used
        """
        return [(graph.get_node_freq(rd), rd) for rd in func(fork)]

    while True:
        forks = graph.get_forks()
        if not forks:
            break

        for fork in forks:
            # get the precessor and successor reads of the fork
            lreads = _get_read_pairs(graph, graph.predecessors)
            rreads = _get_read_pairs(graph, graph.successors)

            edgelist = resolver(lreads, rreads)
            for lread, rread, freq in edgelist:
                graph.split_edge(fork, lread, rread, freq)

            graph.remove_node(fork)

            reads = [r for f, r in lreads] + [r for f, r in rreads]
            graph.remove_nodes_from(reads)

    return graph


def _guide_distribution_method(freq_matrix, se_positions, use_chi_sqrd):
    """
    """

    total = 0.0
    quasispecies = {}
    variants = {}
    row, col = freq_matrix.shape
    read_count = row * col
    examined = {}

    # just shortcircuit here if no columns to check.
    if col == 0:
        return {}

    while True:
        guide = _choose_guide(freq_matrix, use_chi_sqrd)

        # matching with other amplicons
        for gdx, gpair in enumerate(freq_matrix[guide]):
            gread, gfreq = gpair
            examined[(guide, gdx)] = True

            # we cannot use this read
            if gfreq == 0:
                continue

            start, end = se_positions[guide]
            if guide != 0:
                lstart, lend = se_positions[guide - 1]
                overlap = lend - start
            else:
                # doesnt matter it wont be used
                overlap = 0

            # match to the "left" of the amplicon
            lreads = _find_reads(gread, gdx, reversed(freq_matrix[:guide]), \
                        overlap, examined, swap_compare=True)
            lreads.reverse()
            lreads.append((gread, gdx))

            if guide != len(freq_matrix) - 1:
                rstart, rend = se_positions[guide + 1]
                overlap = end - rstart
            else:
                # doesnt matter it wont be used
                overlap = 0

            # match to the "right" of the amplicon
            rreads = _find_reads(gread, gdx, freq_matrix[guide + 1:], \
                        overlap, examined)
            lreads.extend(rreads)

            # we should have a read from each amplicon
            if len(lreads) == len(freq_matrix):
                # this unzips the list and gives us
                # the list of reads and list of indices
                reads, idxs = zip(*lreads)

                # create the variant, add it to the list,
                # and update the frequency matrix
                quasispecies[bioa.reconstruct_sequence(reads, se_positions)] = gfreq
                _update_freq_matrix(freq_matrix, idxs, gfreq)
                total += gfreq

        # if a column is empty, we"ve exhausted our possiblities
        # and should finish, or if all items have been examined
        # we should finish.
        if _has_empty_column(freq_matrix) or len(examined) == read_count:
            break

    for qs, freq in quasispecies.items():
        variants[qs] = freq / total

    return variants


def _has_empty_column(freq_matrix):
    """ Determines if the frequency matrix has a column of all zeros.

    Parameters
    ----------
    freq_matrix : numpy matrix
        The frequency matrix over the reads.

    Returns
    -------
    contains_empty_column : bool
            True if the matrix contains a column of all zeros. False otherwise.
    """

    func = lambda x, y: x and y
    exhausted = False
    for col in freq_matrix:
        if not reduce(func, [f for r, f in col]):
            exhausted = True
            break
    return exhausted


def _update_freq_matrix(freq_matrix, indices, freq):
    """
    This updates the frequency matrix by subtracting the supplied frequency from
    the found reads.

    Parameters
    ----------

    freq_matrix : bioa.FrequencyMatrix
        The frequency matrix over the reads.

    indices : list
        List of indices where the chosen reads were in the original matrix.

    freq : float
        The frequency to deduct from other reads.

    Returns
    -------
    None
    """

    for cdx, rdx in enumerate(indices):
        read, val = freq_matrix[cdx][rdx]
        val = max([0, val - freq])
        freq_matrix[cdx][rdx] = (read, val)

    return


def _find_reads(seed, seedx, columns, overlap, examined, swap_compare=False):
    """
    Finds all reads over the columns given
    the seed.

    Parameters
    ----------

    seed = The read to begin matching consistent
        reads with.

    seedx = The index of the initial seed.

    columns = List of columns to iterate over,
        searching for reads.

    overlap = The amount that any neighboring reads
        overlap by.

    examined = Boolean dictionary to indicate that we have
        examined a given read.

    swap_compare = Whether or not to swap the order
        of the comparing reads for consistency.

    Returns
    -------

    A list of pairs. The first element of the tuple is the read that is
    consistent with its neighbor. The second item is the index of the found
    read. An empty list is returned if none are found.
    """

    readidxpairs = []
    tmpread = seed
    tmpsdx = seedx
    for cdx, col in enumerate(columns):
        read, idx = _find_consistent_read(tmpread, tmpsdx, cdx, col, \
                        overlap, examined, swap_compare)
        if read != "":
            readidxpairs.append((read, idx))
            tmpread = read
            tmpsdx = idx

    return readidxpairs


def _find_consistent_read(seed, seedx, colidx, column, overlap, \
        examined, swap_compare):
    """ Finds a consistent read given the seed and other columns to search.

    Parameters
    ----------
    seed : str
        The read to match a consistent read with.

    seedx : int
        The index of the seed.

    colidx : int
        The index of the current column in the original frequency matrix.

    column : list
        The current column to iterate over to check for a consistent read.

    overlap : int
        The amount that any neighboring reads overlap by.

    examined : dict
        Boolean dictionary to indicate that we have examined a given read.

    swap_compare : bool
        Whether or not to swap the order of the comparing reads for consistency.

    Returns
    -------
    read_and_position : (str, int)
        The consistent read _in the current_ column, and
        its index. Empty string and -1 if not found.
    """

    read = ""
    mdx = sys.maxint
    tmpmdx = mdx
    ridx = -1
    consistent = False
    for idx, pair in enumerate(column):
        pread, pfreq = pair
        examined[(colidx, idx)] = True
        # if its an empty read, no bother checking for consistency
        if pread == "":
            continue

        # if we are going to the left in the matrix, we need
        # to swap the order in checking consistency
        if swap_compare:
            consistent = bioa.are_consistent(pread, seed, overlap)
        else:
            consistent = bioa.are_consistent(seed, pread, overlap)

        tmpmdx = abs(seedx - idx)
        if consistent and pfreq != 0 and tmpmdx < mdx:
            read = pread
            ridx = idx
            mdx = tmpmdx

    return (read, ridx)


def _choose_guide(freq_matrix, use_chi_sqrd=False):
    """
    Selects a guide amplicon from the read
    matrix. This can be selected by the
    uniform distribution or by maximum likelihood.

    Parameters:

    freq_matrix : bioa.FrequencyMatrix
        Numpy matrix where columns correspond to the absolute frequency
        (i.e. counts) distributions of distinct reads in the amplicons and rows
        correspond to distinct read-representatives with their associated
        frequencies.

    use_chi_sqrd : bool
        Whether to sample using the chi squared distribution or uniform
        distribution.

    Returns
    -------
    index : int
        A guide index.
    """
    def _pcs(ex, obs):
        return ((ex - obs) ** 2.0) / ex if ex != 0.0 else 0.0

    val = 0
    if use_chi_sqrd:
        # perform pearson chi-squared tests for all pairs use the amplicon
        # that has the least sum of all its chi-square tests
        final = numpy.zeros(len(freq_matrix))
        for idx, amp in enumerate(freq_matrix):
            chis = []
            reads, freqs = zip(*amp)
            for namp in freq_matrix:
                nreads, nfreqs = zip(*namp)
                pairs = zip(freqs, nfreqs)
                # $\chi^2 = \sum_i^n \frac{(O_i - E_i)^2}{E_i}$
                chisqr = sum([_pcs(pair[0], pair[1]) for pair in pairs])
                chis.append(chisqr)
            final[idx] = sum(chis)

        val = final.argmin()
    else:
        # just use uniform distribution to choose
        val = numpy.random.randint(0, len(freq_matrix))

    return val
Navigation

Quick search

Source code for bioa.algorithms.reconstruction

Navigation