# -*- coding: utf-8 -*-
"""
================================================
Algorithms for Viral Quasispecies Reconstruction
================================================
"""

__all__ = ["frequency_em",
           "max_bandwidth_strategy",
           "random_bandwidth_strategy",
           "max_frequency_strategy",
           "greedy_fork_resolution",
           "random_fork_resolution",
           "min_forest_fork_resolution",
           "least_squares_fork_balance",
           "min_unsplittable_flows_resolution",
           "amplicon_frequency_matrix_strategy",
           "amplicon_frequency_matrix_strategy_from_graph",
           "update_freq_matrix_from_lp_output"]


__author__ = """Nicholas Mancuso (nick.mancuso@gmail.com)"""
#   Copyright (C) 2011-2012 by
#   Nicholas Mancuso <nick.mancuso@gmail.com>
#   Bassam Tork <basamt@gmail.com>
#   All rights reserved.
#   BSD license.

import math
import collections as coll
import itertools as itools
import logging
import random as rdm
import sys
import bioa
import numpy
import networkx as netx


def frequency_em(graph, quasispecies, epsilon=0.001):
    """ Estimate the quasispecies' frequencies via Expectation Maximization.

    Parameters
    ----------
    graph : ReadGraph
        The read graph used to reconstruct the viral population.

    quasispecies : dict
        The reconstructed quasispecies.

    epsilon : float (optional, default=0.001)
        Threshold for convergence.

    Returns
    -------
    quasispecies : dict
        The reconstructed quasispecies with new frequencies.
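
    Examples
    --------
    A minimal usage sketch (hypothetical names; assumes ``rgraph`` is a
    populated read graph and ``qsps`` maps each reconstructed sequence to
    the list of read nodes it was built from):

    >>> new_qsps = frequency_em(rgraph, qsps, epsilon=1e-4)  # doctest: +SKIP
    >>> abs(sum(new_qsps.values()) - 1.0) < 1e-9  # doctest: +SKIP
    True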
    """

    em_graph = netx.Graph()

    # helper functions for frequency, probability, and counts
    def _get_qfreq(qsps):
        return em_graph.node[qsps]["freq"]

    def _set_qfreq(qsps, freq):
        em_graph.node[qsps]["freq"] = freq

    def _get_rfreq(read):
        return em_graph.node[read]["count"]

    def _get_p(qsps, r):
        return em_graph[qsps][r]["p"]

    def _set_p(qsps, r, p):
        em_graph[qsps][r]["p"] = p

    reads = graph.get_read_nodes()
    for read in reads:
        freq = graph.get_node_freq(read)
        em_graph.add_node(read, count=freq, color=1)

    init_f = 1.0 / len(quasispecies)
    for qsps in quasispecies:
        em_graph.add_node(qsps, color=0, freq=init_f)
        for read in quasispecies[qsps]:
            if read in [graph.sink, graph.source] \
                    or read not in reads:
                continue
            em_graph.add_edge(qsps, read)

    freqs = {qsps: _get_qfreq(qsps) for qsps in quasispecies}
    while True:
        # E step
        for qsps in quasispecies:
            f_qsps = _get_qfreq(qsps)
            for read in em_graph.neighbors(qsps):
                total_f = sum(_get_qfreq(nbr) for nbr in em_graph.neighbors(read))
                _set_p(qsps, read, f_qsps / total_f)

        # M step
        for qsps in quasispecies:
            total_p = sum(_get_p(qsps, r) * _get_rfreq(r) for r in em_graph.neighbors(qsps))
            qfreq = total_p / sum(_get_rfreq(r) for r in em_graph.neighbors(qsps))
            _set_qfreq(qsps, qfreq)

        delta_f = sum((_get_qfreq(qsps) - freqs[qsps]) ** 2.0 for qsps in quasispecies)
        if delta_f < epsilon:
            break

        freqs = {qsps: _get_qfreq(qsps) for qsps in quasispecies}

    # normalize the final (converged) frequencies
    total_f = sum(_get_qfreq(qsps) for qsps in quasispecies)
    return {qsps: _get_qfreq(qsps) / total_f for qsps in quasispecies}


def amplicon_frequency_matrix_strategy_from_graph(graph, use_chi_sqrd=False):
    """ Reconstructs quasispecies from the supplied graph. An amplicon
    strategy is used to pair reads with one another using a most-consistent
    function.

    Parameters
    ----------
    graph : DecliquedAmpliconReadGraph
        Amplicon read graph with fork vertices added.

    use_chi_sqrd : bool (optional, default=False)
        Whether to use the chi-squared test in selecting a guide or the
        uniform distribution.

    Returns
    -------
    quasispecies : dict
        Quasispecies spectrum with sequences as keys and frequency as value.
    """
    if not graph:
        raise ValueError("Proper read graph expected!")

    freq_matrix = bioa.FrequencyMatrix(graph.get_amplicons(), True)
    se_positions = graph.get_start_end_positions()
    return _guide_distribution_method(freq_matrix, se_positions, use_chi_sqrd)
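
# A usage sketch for the graph-based amplicon strategy (hypothetical names;
# assumes `rgraph` is a populated DecliquedAmpliconReadGraph):
#
#     spectrum = amplicon_frequency_matrix_strategy_from_graph(
#         rgraph, use_chi_sqrd=True)
#     for seq, freq in spectrum.items():
#         print seq[:30], freq
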
def amplicon_frequency_matrix_strategy(amplicons, use_chi_sqrd=False,
                                       norm=False):
    """ Reconstructs quasispecies from the supplied read matrix. An amplicon
    strategy is used to pair reads with one another using a most-consistent
    function.

    Parameters
    ----------
    amplicons : Amplicons
        Amplicons data structure where each amplicon is a list of reads.
        This will be used to construct the frequency matrix.

    use_chi_sqrd : bool (optional, default=False)
        Whether to use the chi-squared test in selecting a guide or the
        uniform distribution.

    norm : bool (optional, default=False)
        Whether or not to normalize read counts to frequencies.

    Returns
    -------
    quasispecies : dict
        Quasispecies spectrum with sequences as keys and frequency as value.
    """
    if norm:
        freq_matrix = bioa.FrequencyMatrix.construct_normalized_freq_matrix(
            amplicons, True)
    else:
        freq_matrix = bioa.FrequencyMatrix(amplicons, True)

    return _guide_distribution_method(freq_matrix, amplicons.se_positions,
                                      use_chi_sqrd)
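
# A usage sketch for the matrix-based variant (hypothetical names; assumes
# `amps` is a bioa.Amplicons built from aligned reads). norm=True works on
# normalized frequencies rather than raw counts:
#
#     spectrum = amplicon_frequency_matrix_strategy(amps, use_chi_sqrd=False,
#                                                   norm=True)
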
def max_bandwidth_strategy(graph, epsilon=1e-3, factor=1.0,
                           until_covered=False, use_em=False):
    """ Finds quasispecies over an amplicon-based read graph by repeatedly
    finding the max-bandwidth path until no path from the source to the
    sink remains.

    Parameters
    ----------
    graph : DecliquedAmpliconReadGraph
        Amplicon read graph with fork vertices added.

    epsilon : float (optional, default=1e-3)
        Threshold for edge values to consider.

    factor : float (optional, default=1.0)
        A scaling factor to reduce edge weights by. By default it will not
        change the amount.

    until_covered : bool (optional, default=False)
        Setting 'until_covered' to True will only output the first set of
        quasispecies to cover all edges. Setting to False will output the
        entire set.

    use_em : bool (optional, default=False)
        Determines whether to use naive frequency assignment for the
        quasispecies or the Expectation Maximization method.

    Returns
    -------
    quasispecies : dict
        Quasispecies spectrum with sequences as keys and frequency as value.
    """
    if not graph:
        raise ValueError("Non-empty Read-Graph expected!")

    def _max_band(graph):
        """ Find a maximum-bandwidth s-t path. """
        source = graph.source
        return bioa.max_bandwidth(graph.graph, source, "count", epsilon)

    return _general_path_resolution(graph, _max_band, factor,
                                    until_covered, use_em)
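
# A usage sketch (hypothetical `rgraph`). With until_covered=True only the
# first set of max-bandwidth paths covering every edge is reported, and
# use_em=True re-estimates frequencies with frequency_em() afterwards:
#
#     spectrum = max_bandwidth_strategy(rgraph, epsilon=1e-3,
#                                       until_covered=True, use_em=True)
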
def random_bandwidth_strategy(graph, freq_name="count", epsilon=1e-3,
                              factor=1.0):
    """ Finds quasispecies over an amplicon-based read graph by repeatedly
    finding a random max-bandwidth path until no path from the source to
    the sink remains.

    Parameters
    ----------
    graph : DecliquedAmpliconReadGraph
        Amplicon read graph with fork vertices added.

    freq_name : str (optional, default="count")
        Name of the frequency or count property to use.

    epsilon : float (optional, default=1e-3)
        Threshold for edge values to consider.

    factor : float (optional, default=1.0)
        A scaling factor to reduce edge weights by. By default it will not
        change the amount.

    Returns
    -------
    quasispecies : dict
        Quasispecies spectrum with sequences as keys and frequency as value.
    """
    if not graph:
        raise ValueError("Non-empty Read-Graph expected!")

    def _rand_path(graph):
        """ Find a random s-t path in the graph such that each edge has at
        least epsilon count over it. """
        source = graph.source
        sink = graph.sink
        bwidth = {sink: sys.maxint}
        path = {source: [source]}
        node = source
        while node != sink:
            # filter only valid successors
            choices = [succ for succ in graph.successors(node)
                       if graph.get_edge_freq(node, succ) > epsilon]
            # return if we are stuck
            if not choices:
                return {}, {}

            # randomly choose a successor
            next_node = rdm.choice(choices)
            count = graph.get_edge_freq(node, next_node)
            # if the count is less, set it as the max bandwidth
            if count < bwidth[sink]:
                bwidth[sink] = count

            # path to the next node is the path to the current node,
            # plus itself
            path[next_node] = path[node] + [next_node]
            node = next_node

        return bwidth, path

    return _general_path_resolution(graph, _rand_path, factor)
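
# Since the path choice is random, one plausible pattern (an assumption, not
# part of this module; requires that the graph object supports deep copying)
# is to aggregate several independent runs:
#
#     import copy
#     runs = [random_bandwidth_strategy(copy.deepcopy(rgraph))
#             for _ in range(10)]
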
def max_frequency_strategy(graph, epsilon=1e-3, factor=1.0,
                           until_covered=False):
    """ Finds quasispecies over an amplicon-based read graph by repeatedly
    finding a maximum-frequency path until no path from the source to the
    sink remains.

    Parameters
    ----------
    graph : DecliquedAmpliconReadGraph
        Amplicon read graph with fork vertices added.

    epsilon : float (optional, default=1e-3)
        Threshold for edge values to consider.

    factor : float (optional, default=1.0)
        A scaling factor to reduce edge weights by. By default it will not
        change the amount.

    until_covered : bool (optional, default=False)
        Setting 'until_covered' to True will only output the first set of
        quasispecies to cover all edges. Setting to False will output the
        entire set.

    Returns
    -------
    quasispecies : dict
        Quasispecies spectrum with sequences as keys and frequency as value.
    """
    if not graph:
        raise ValueError("Non-empty Read-Graph expected!")

    def _max_freq_path(graph):
        """ Find a max-frequency s-t path in the graph such that each edge
        has at least epsilon count over it. """
        sink = graph.sink
        source = graph.source
        path = {}
        dist = {source: 0}
        final = {sink: [sink]}
        # since our graph is directed and acyclic (a DAG), we can use
        # topological sort + dynamic programming to solve in linear time.
        for node in netx.topological_sort(graph.graph):
            # if we haven't gotten to this node by now, we can't use it.
            if node not in dist:
                continue
            for next_node in graph.successors(node):
                # if the cost of this edge is less than epsilon, skip it
                ecost = graph.get_edge_freq(node, next_node)
                if ecost < epsilon:
                    continue
                # if the current cost to next_node is less than the cost
                # using the (node, next_node) edge, replace it
                cost = dist[node] + ecost
                if dist.get(next_node, epsilon) < cost:
                    dist[next_node] = cost
                    path[next_node] = node

        # if we never reached the sink, there is no valid path
        if sink not in dist:
            return {}, {}

        node = sink
        while node != source:
            next_node = path[node]
            final[sink].append(next_node)
            node = next_node
        final[sink].reverse()

        return dist, final

    return _general_path_resolution(graph, _max_freq_path, factor,
                                    until_covered)
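
# A usage sketch (hypothetical `rgraph`). Unlike the bandwidth strategies,
# the inner path-finder scores a path by the *sum* of its edge frequencies,
# computed by dynamic programming over a topological order:
#
#     spectrum = max_frequency_strategy(rgraph, epsilon=1e-3,
#                                       until_covered=True)
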
def greedy_fork_resolution(graph, perfect_match=True):
    """ Greedily resolve forks in the read graph.

    Parameters
    ----------
    graph : DecliquedAmpliconReadGraph
        Amplicon read graph with fork vertices added.

    perfect_match : bool (optional, default=True)
        Whether to match values that are equal before considering maximums.

    Returns
    -------
    graph : DecliquedAmpliconReadGraph
        Modified read graph with all forks resolved.
    """
    if not graph:
        raise ValueError("Non-empty Read-Graph expected!")

    epsilon = 0.001

    def _find_match(pq, value):
        match = None
        for read, count in pq:
            if math.fabs(value - count) <= epsilon:
                match = read
                break
        if match:
            pq.remove_node(match)
        return match

    def _find_perfect_matches(lpq, rpq, lmax, lread, rmax, rread):
        match_tuple = None
        if lmax >= rmax:
            if math.fabs(lmax - rmax) < epsilon:
                match_tuple = (lread, rread, lmax)
            else:
                match = _find_match(rpq, lmax)
                if match:
                    match_tuple = (lread, match, lmax)
                rpq.add_node(rmax, rread)
        else:
            if math.fabs(lmax - rmax) < epsilon:
                match_tuple = (lread, rread, rmax)
            else:
                match = _find_match(lpq, rmax)
                if match:
                    match_tuple = (match, rread, rmax)
                lpq.add_node(lmax, lread)
        return match_tuple

    def _greedy_resolve(lreads, rreads):
        # build max priority queues to get the max-count read from
        # both the left reads and the right reads of the biclique
        resolution = []
        lpq = bioa.PQueue(lreads, behave="max")
        rpq = bioa.PQueue(rreads, behave="max")

        # greedily pair up reads, reducing their counts
        while lpq and rpq:
            lread, lmax = lpq.get_max()
            rread, rmax = rpq.get_max()
            if perfect_match:
                match = _find_perfect_matches(lpq, rpq, lmax,
                                              lread, rmax, rread)
                # we found a perfect matchup, so move on
                if match:
                    resolution.append(match)
                    continue

            minmax = min([lmax, rmax])
            lmax -= minmax
            rmax -= minmax
            # Considering that the _sum_ of both sides is equal,
            # one, or both, may be exhausted of counts.
            if lmax > 0:
                lpq.add_node(lmax, lread)
            elif rmax > 0:
                rpq.add_node(rmax, rread)

            resolution.append((lread, rread, minmax))

        return resolution

    return _general_fork_resolution(graph, _greedy_resolve)
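
# Fork resolution is typically a preprocessing step before a path strategy
# (a sketch under that assumption; `rgraph` is hypothetical):
#
#     rgraph = greedy_fork_resolution(rgraph, perfect_match=True)
#     spectrum = max_bandwidth_strategy(rgraph)
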
def random_fork_resolution(graph):
    """ Resolve forks by randomly matching fork vertices.

    Parameters
    ----------
    graph : DecliquedAmpliconReadGraph
        Amplicon read graph with fork vertices added.

    Returns
    -------
    graph : DecliquedAmpliconReadGraph
        Modified read graph with all forks resolved.
    """
    if not graph:
        raise ValueError("Non-empty Read-Graph expected!")

    def _random_resolve(lreads, rreads):
        # build a set for the left and right sides of the fork, then
        # randomly select from each set and make an edge until counts
        # are exhausted
        resolution = []
        left = set(lreads)
        right = set(rreads)
        while left and right:
            # randomly pair up reads, reducing their counts.
            # the conversion to list is temporary until a better solution
            # is realized.
            lfreq, lread = rdm.choice(list(left))
            rfreq, rread = rdm.choice(list(right))
            left.remove((lfreq, lread))
            right.remove((rfreq, rread))

            minfreq = min([lfreq, rfreq])
            lfreq -= minfreq
            rfreq -= minfreq
            # Considering that the _sum_ of both sides is equal,
            # one, or both, may be exhausted of counts.
            if lfreq > 0:
                left.add((lfreq, lread))
            elif rfreq > 0:
                right.add((rfreq, rread))

            resolution.append((lread, rread, minfreq))

        return resolution

    return _general_fork_resolution(graph, _random_resolve)
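
# The random resolver can serve as a baseline against the greedy one
# (a sketch; `rgraph` is hypothetical):
#
#     baseline_graph = random_fork_resolution(rgraph)
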
def min_forest_fork_resolution(graph):
    """ Resolve forks by computing a minimum-imbalance forest. For each fork,
    an integer program selects edges between the smaller and larger sides of
    the fork so that the total absolute count imbalance at each smaller-side
    root is minimized.

    Parameters
    ----------
    graph : DecliquedAmpliconReadGraph
        Amplicon read graph with fork vertices added.

    Returns
    -------
    graph : DecliquedAmpliconReadGraph
        Modified read graph with all forks resolved.

    Notes
    -----
    Requires CPLEX and the CPLEX Python module to be installed.
    http://www.cplex.com
    """
    try:
        import cplex
    except ImportError:
        raise ImportError(
            "min_forest_fork_resolution() requires cplex: http://cplex.com")

    if not graph:
        raise ValueError("Connected Read Graph required!")

    def _min_forest(lreads, rreads):
        resolution = []
        # w.l.o.g. assume |L| <= |R|
        left_smaller = len(lreads) <= len(rreads)
        if left_smaller:
            lfreqset, lreadset = zip(*lreads)
            rfreqset, rreadset = zip(*rreads)
            lset = {lread: lfreq for lfreq, lread in lreads}
            rset = {rread: rfreq for rfreq, rread in rreads}
        else:
            lfreqset, lreadset = zip(*rreads)
            rfreqset, rreadset = zip(*lreads)
            rset = {lread: lfreq for lfreq, lread in lreads}
            lset = {rread: rfreq for rfreq, rread in rreads}

        lreadset = list(lreadset)
        rreadset = list(rreadset)

        model = cplex.Cplex()
        model.objective.set_sense(model.objective.sense.minimize)
        model.set_problem_type(model.problem_type.LP)

        # suppress outputs
        model.set_results_stream(None)
        model.set_warning_stream(None)
        model.set_error_stream(None)
        model.set_log_stream(None)

        # variables are over edges and over the balance of a root
        edge_map = {str((lread, rread)): (lread, rread)
                    for lread in lreadset for rread in rreadset}
        edge_variables = edge_map.keys()

        # add the binary edge-selection variables
        var_types = [model.variables.type.binary] * len(edge_variables)
        model.variables.add(names=edge_variables, types=var_types)

        # add the absolute value of the total balance of a tree variables
        lower_bs = [0.0] * len(lreadset)
        var_types = [model.variables.type.continuous] * len(lreadset)
        model.variables.add(names=lreadset, lb=lower_bs, types=var_types)

        # objective is to minimize the sum of the absolute-value balances
        model.objective.set_linear([(lread, 1.0) for lread in lreadset])
        model.objective.set_linear([(edge, 0.0) for edge in edge_variables])

        # bound the number of smaller-side parents each larger-side read
        # may take:
        # \sum_i^L x_{ij} <= 2 \forall j \in R
        for rread in rreadset:
            edge_vars = [str((lread, rread)) for lread in lreadset]
            coefficients = [1.0] * len(lreadset)
            lhs = cplex.SparsePair(edge_vars, coefficients)
            model.linear_constraints.add(lin_expr=[lhs], senses=["L"],
                                         rhs=[2.0])

        # add the constraints to find the absolute value of the balance
        # of a root:
        # \sum_j^R r_j x_{ij} - l_i <= y_i \forall i \in L
        # l_i - \sum_j^R r_j x_{ij} <= y_i \forall i \in L
        for lread in lreadset:
            lfreq = lset[lread]
            # first direction
            edge_vars = [str((lread, rread)) for rread in rreadset]
            coefficients = [rfreq for rfreq in rfreqset]
            lhs = cplex.SparsePair(edge_vars + [lread], coefficients + [-1.0])
            model.linear_constraints.add(lin_expr=[lhs], senses=["L"],
                                         rhs=[lfreq])
            # then the other
            coefficients = [-rfreq for rfreq in rfreqset]
            lhs = cplex.SparsePair(edge_vars + [lread], coefficients + [-1.0])
            model.linear_constraints.add(lin_expr=[lhs], senses=["L"],
                                         rhs=[-lfreq])

        model.solve()

        # the model decides which edges to choose, but we must still
        # calculate the frequency to place over each edge. While a tree
        # solution will be unique, if balancing is not done to the graph
        # beforehand we may have negative assignments. Therefore, we
        # assign as a ratio.
        for edge, selected in zip(edge_variables,
                                  model.solution.get_values(edge_variables)):
            if not selected:
                continue
            left, right = edge_map[edge]
            lfreq = lset[left]
            rfreq = rset[right]
            freq = rfreq / lfreq
            # the resolution needs to be in actual left-right order
            if left_smaller:
                resolution.append((left, right, freq))
            else:
                resolution.append((right, left, freq))

        return resolution

    return _general_fork_resolution(graph, _min_forest)
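
# A usage sketch (requires a working CPLEX install; `rgraph` hypothetical),
# falling back to the greedy resolver when CPLEX is unavailable:
#
#     try:
#         rgraph = min_forest_fork_resolution(rgraph)
#     except ImportError:
#         rgraph = greedy_fork_resolution(rgraph)
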
def min_unsplittable_flows_resolution(graph, flow_num, nthreads=0,
                                      timeout=300, threshold=1e-5):
    """ Min-k unsplittable flows method. This tries to find k paths in the
    graph that cover the reads with the minimum amount of flow.

    Parameters
    ----------
    graph : DecliquedAmpliconReadGraph
        Amplicon read graph with fork vertices added.

    flow_num : int
        The number of paths to try.

    nthreads : int (optional, default=0)
        The number of threads to use.
        0 = use the maximum number of threads (i.e. CPUs),
        1 = single-threaded,
        n = use n threads.

    timeout : int (optional, default=300)
        The number of seconds to run cplex until timeout.

    threshold : float (optional, default=1e-5)
        The threshold for determining whether a variable is selected.

    Returns
    -------
    quasispecies : dict
        Quasispecies spectrum with sequences as keys and frequency as value.

    Notes
    -----
    Requires CPLEX and the CPLEX Python module to be installed.
    http://www.cplex.com
    """
    try:
        import cplex
    except ImportError:
        raise ImportError(
            "min_unsplittable_flows_resolution() requires cplex: "
            "http://cplex.com")

    if not graph:
        raise ValueError("Connected Read Graph required!")
    if flow_num < 1:
        raise ValueError("Flow must be positive integer!")
    if nthreads < 0:
        raise ValueError("Number of threads must be at least 0"
                         " (for default behavior)")
    if timeout < 1:
        raise ValueError("Timeout must be positive value!")

    # keep caches of variables for faster constraint building.
    # helper functions to access constraints
    pred_cache = {}
    succ_cache = {}

    def pred_node_vars(flow, node, name_func):
        key = name_func((flow, node))
        if key not in pred_cache:
            edges = itools.product(
                [flow],
                itools.product(graph.graph.predecessors(node), [node]))
            pred_cache[key] = map(name_func, edges)
        return pred_cache[key]

    def succ_node_vars(flow, node, name_func):
        key = name_func((flow, node))
        if key not in succ_cache:
            edges = itools.product(
                [flow],
                itools.product([node], graph.graph.successors(node)))
            succ_cache[key] = map(name_func, edges)
        return succ_cache[key]

    logging.debug("Building MCF model")
    model = cplex.Cplex()
    model.set_problem_type(model.problem_type.MILP)

    # suppress outputs
    model.set_results_stream(None)
    model.set_warning_stream(None)
    model.set_error_stream(None)

    # set parameters
    model.parameters.threads.set(nthreads)
    model.parameters.timelimit.set(timeout)

    # add a t-s edge to turn this into a circulation problem
    source, sink = graph.source, graph.sink
    graph.add_edge(sink, source, 0)

    # just re-use this list
    edges = graph.graph.edges()

    # add the binary flow variables
    flow_range = range(flow_num)
    bin_var_name = str
    bvariables = map(bin_var_name, itools.product(flow_range, edges))
    var_types = [model.variables.type.binary] * len(bvariables)
    model.variables.add(names=bvariables, types=var_types)

    # add the real-valued flow variables
    frac_var_name = lambda x: "{0}*".format(x)
    variables = map(frac_var_name, itools.product(flow_range, edges))
    var_types = [model.variables.type.continuous] * len(variables)
    lbounds = [0.0] * len(variables)
    ubounds = [1.0] * len(variables)
    model.variables.add(names=variables, lb=lbounds, ub=ubounds,
                        types=var_types)

    # objective is to minimize the sum of the flows
    obj_edge = itools.product(flow_range, [(sink, source)])
    obj_vars = itools.imap(lambda x: (frac_var_name(x), 1.0), obj_edge)
    model.objective.set_linear(list(obj_vars))
    model.objective.set_sense(model.objective.sense.minimize)

    # constraint 1 is to cover all edges in the decliqued graph
    for node, next_node in edges:
        e_variables = map(frac_var_name,
                          itools.product(flow_range, [(node, next_node)]))
        coefficients = [1.0] * len(e_variables)
        lhs = cplex.SparsePair(e_variables, coefficients)
        freq = graph.get_edge_freq(node, next_node)
        model.linear_constraints.add(lin_expr=[lhs], senses=["G"], rhs=[freq])

    # constraint 2 is to make sure that flows are conserved
    for flow, node in itools.product(flow_range, graph):
        in_variables = pred_node_vars(flow, node, bin_var_name)
        out_variables = succ_node_vars(flow, node, bin_var_name)
        in_coefficients = [1.0] * len(in_variables)
        out_coefficients = [-1.0] * len(out_variables)
        lhs = cplex.SparsePair(in_variables + out_variables,
                               in_coefficients + out_coefficients)
        model.linear_constraints.add(lin_expr=[lhs], senses=["E"], rhs=[0.0])

    # the same conservation constraints for the fractional flow variables
    for flow, node in itools.product(flow_range, graph):
        in_variables = pred_node_vars(flow, node, frac_var_name)
        out_variables = succ_node_vars(flow, node, frac_var_name)
        in_coefficients = [1.0] * len(in_variables)
        out_coefficients = [-1.0] * len(out_variables)
        lhs = cplex.SparsePair(in_variables + out_variables,
                               in_coefficients + out_coefficients)
        model.linear_constraints.add(lin_expr=[lhs], senses=["E"], rhs=[0.0])

    # constraint 3 is to select binary flows
    for flow, edge in itools.product(flow_range, edges):
        bin_var = bin_var_name((flow, edge))
        var = frac_var_name((flow, edge))
        lhs = cplex.SparsePair([bin_var, var], [1.0, -1.0])
        model.linear_constraints.add(lin_expr=[lhs], senses=["G"], rhs=[0.0])

    # constraint 4 is to make flows unsplittable
    for flow, node in itools.product(flow_range, graph):
        out_variables = succ_node_vars(flow, node, bin_var_name)
        out_coefficients = [1.0] * len(out_variables)
        lhs = cplex.SparsePair(out_variables, out_coefficients)
        model.linear_constraints.add(lin_expr=[lhs], senses=["L"], rhs=[1.0])

    con_num = model.linear_constraints.get_num()
    bin_num = model.variables.get_num_binary()
    frac_num = model.variables.get_num() - bin_num
    log_str = "MCF model has {} constraints".format(con_num)
    log_str += ", {} binary variables".format(bin_num)
    log_str += ", and {} continuous variables".format(frac_num)
    logging.debug(log_str)

    model.solve()
    fvalues = dict(zip(variables, model.solution.get_values(variables)))

    flow_paths = coll.defaultdict(list)
    flow_vals = coll.defaultdict(float)
    tflow = 0.0

    # remove the dummy edge
    graph.remove_edge(sink, source)
    bfs_edges = list(netx.bfs_edges(graph.graph, graph.source))
    for flow, edge in itools.product(flow_range, bfs_edges):
        # if we've selected this edge for the kth flow AND it carries
        # an actual flow value
        if fvalues[frac_var_name((flow, edge))] > threshold:
            u, v = edge
            if u == graph.source:
                tflow += fvalues[frac_var_name((flow, edge))]
                flow_vals[flow] += fvalues[frac_var_name((flow, edge))]
            flow_paths[flow].append(graph.get_node_read(v))

    final = {}
    for i, flow in enumerate(flow_vals):
        if flow_vals[flow] == 0.0:
            continue
        qs = bioa.reconstruct_sequence(flow_paths[flow],
                                       graph.get_start_end_positions())
        final[qs] = flow_vals[flow] / tflow
        log_str = "Variant {0} has length {1} and frequency {2}"
        log_str = log_str.format(i, len(qs), final[qs])
        logging.debug(log_str)

    logging.debug("MCF found {} total variants".format(len(final)))
    return final
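
# A usage sketch (requires CPLEX; `rgraph` is hypothetical). Since each flow
# yields at most one path, flow_num bounds the reported population size:
#
#     spectrum = min_unsplittable_flows_resolution(rgraph, flow_num=10,
#                                                  nthreads=1, timeout=600)
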
def least_squares_fork_balance(graph, weighted=False):
    """ Balance all forks in the graph by a least-squares approach.

    Parameters
    ----------
    graph : DecliquedAmpliconReadGraph
        Amplicon read graph with fork vertices added.

    weighted : bool (optional, default=False)
        Whether to calculate weighted least squares or ordinary least
        squares.

    Returns
    -------
    balanced_graph : DecliquedAmpliconReadGraph
        Amplicon read graph with all forks balanced.

    Notes
    -----
    This function requires cplex to be installed in order to work.
    http://cplex.com
    """
    try:
        import cplex
    except ImportError:
        raise ImportError(
            "least_squares_fork_balance() requires cplex: http://cplex.com")

    if not graph:
        raise ValueError("Connected Read Graph required!")

    model = cplex.Cplex()
    model.objective.set_sense(model.objective.sense.minimize)
    model.set_problem_type(model.problem_type.QP)

    # suppress outputs
    model.set_results_stream(None)
    model.set_warning_stream(None)
    model.set_error_stream(None)
    model.set_log_stream(None)

    source = graph.source
    sink = graph.sink
    reads, forks = graph.get_read_and_fork_nodes()
    forks = forks - set([source, sink])

    # variables are over reads
    variables = map(str, reads)

    # add the objective (minimize residuals)
    lower_bs = [-graph.get_node_freq(read) for read in reads]
    upper_bs = [cplex.infinity] * len(reads)
    model.variables.add(names=variables, lb=lower_bs, ub=upper_bs)
    if not weighted:
        coefficients = [(var, var, 1.0) for var in variables]
    else:
        scale_coeffics = [1.0 / graph.get_node_freq(read) for read in reads]
        coefficients = [(var, var, coef) for var, coef in
                        zip(variables, scale_coeffics)]
    model.objective.set_quadratic_coefficients(coefficients)

    # add a balance constraint for each fork
    for fork in forks:
        preds = graph.predecessors(fork)
        succs = graph.successors(fork)
        b_vars = []
        coefficients = ([1.0] * len(preds)) + ([-1.0] * len(succs))
        tally = 0.0
        for pred in preds:
            freq = graph.get_node_freq(pred)
            b_vars.append(str(pred))
            tally -= freq
        for succ in succs:
            freq = graph.get_node_freq(succ)
            b_vars.append(str(succ))
            tally += freq

        lhs = cplex.SparsePair(b_vars, coefficients)
        model.linear_constraints.add(lin_expr=[lhs], senses=["E"],
                                     rhs=[tally])

    # solve and readjust the graph
    model.solve()
    for read, offset in zip(variables, model.solution.get_values()):
        freq = graph.get_node_freq(read)
        graph.set_node_freq(read, freq + offset)

    return graph
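
# Balancing is intended to run before fork resolution so that the in- and
# out-counts at each fork agree (a sketch; requires CPLEX, `rgraph` is
# hypothetical):
#
#     rgraph = least_squares_fork_balance(rgraph, weighted=True)
#     rgraph = greedy_fork_resolution(rgraph)
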
def update_freq_matrix_from_lp_output(freq_matrix, filename, rounding=False):
    """ Updates the frequency matrix from the output of a balancing scheme.

    Parameters
    ----------
    freq_matrix : Numpy matrix
        Numpy matrix or list of lists of actual reads for each amplicon.
        This will be used to construct the frequency matrix.

    filename : str
        Path to the file that contains the least-squares output from a
        solver.

    rounding : bool (optional, default=False)
        Whether or not to round the results for each read frequency.

    Returns
    -------
    updated_frequency_matrix : Numpy matrix
        The frequency matrix with updated counts.

    Notes
    -----
    This should be deprecated in favor of a newer, direct solver-based
    approach.
    """
    def _parse_coord(strcoord):
        inner = strcoord[1:-1]
        split = inner.split(",")
        assert len(split) == 2, "Invalid coordinate!"
        return int(split[0]), int(split[1])

    with open(filename, "r") as fin:
        for line in fin:
            items = line.split(" ")
            length = len(items) - 1
            # tokens alternate between a "(row,col)" coordinate and its
            # offset, so step through them in pairs
            for idx in range(0, length, 2):
                i, c = _parse_coord(items[idx])
                offset = float(items[idx + 1])
                freq_matrix[i][c][1] += offset
                if rounding:
                    # round the updated read frequency to the nearest count
                    freq_matrix[i][c][1] = round(freq_matrix[i][c][1])

    return freq_matrix
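
# A sketch of the solver-output format the parser above expects (an
# assumption inferred from _parse_coord): one or more whitespace-separated
# "(row,col) offset" pairs per line, e.g. a file containing
#
#     (0,3) 2.5 (1,0) -1.0
#
# would be applied with (filename hypothetical):
#
#     freq_matrix = update_freq_matrix_from_lp_output(freq_matrix,
#                                                     "balance.out")
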
# Let's keep the internal functions here

def _general_path_resolution(graph, path_finder, factor=1.0,
                             until_covered=False, use_em=False):
    """ This is our general algorithm to resolve paths. It will call the
    resolver function to get the pair: bandwidths and paths. The count of
    all edges in the source-sink path will be reduced by the bandwidth
    returned by the resolution function. This continues until no non-zero
    paths are left.

    Parameters
    ----------
    graph : DecliquedAmpliconReadGraph
        Amplicon read graph with fork vertices added.

    path_finder : function
        Function that takes a graph. It should find a source-sink path in
        the graph. It returns a pair of dictionaries. The first is the
        bandwidth dictionary; it should accept at least the sink node as a
        key, returning the bandwidth of the source-sink path. The second is
        the path dictionary; it should accept at least the sink node as a
        key, returning a list of nodes indicating the source-sink path.

    factor : float (optional, default=1.0)
        A scaling factor to reduce edge weights by. By default it will not
        change the amount.

    until_covered : bool (optional, default=False)
        Setting 'until_covered' to True will only output the first set of
        quasispecies to cover all edges. Setting to False will output the
        entire set.

    use_em : bool (optional, default=False)
        Determines whether to use naive frequency assignment for the
        quasispecies or the Expectation Maximization method.

    Returns
    -------
    quasispecies : dict
        Quasispecies spectrum with sequences as keys and frequency as value.
    """
    quasispecies = coll.defaultdict(float)
    path_list = []
    qsps_path = {}
    final = {}
    total = 0.0
    sink = graph.sink
    se_positions = graph.get_start_end_positions()

    func_name = path_finder.__name__
    log_str = "Running generalized path-strategy with {0} as path-finder"
    log_str = log_str.format(func_name)
    logging.info(log_str)

    # We repeatedly get an s-t path and use that as a quasispecies until
    # there are no valid paths from the source to the sink
    while True:
        costs, paths = path_finder(graph)
        # there must exist a path from source -> sink to continue
        if sink not in costs:
            break

        cost = costs[sink]
        path = paths[sink]
        reads = (graph.get_node_read(node) for node in path)

        # join the reads and add to the quasispecies list
        qs = bioa.reconstruct_sequence(reads, se_positions)
        quasispecies[qs] += cost
        total += cost

        # reduce the frequency of the path after our selection
        edge_set = set([])
        for idx, v in enumerate(path[:-1]):
            nextn = path[idx + 1]
            edge_set.add((v, nextn))
            e_freq = graph.get_edge_freq(v, nextn)
            graph.set_edge_freq(v, nextn, e_freq - (cost * factor))

        qsps_path[qs] = path
        path_list.append((cost, edge_set, qs))

    # the user may only want to output the first subset of quasispecies
    # that covers all edges rather than the entire set.
    if until_covered:
        total = 0.0
        nquasispecies = coll.defaultdict(float)
        edges = set(graph.graph.edges())
        # pop paths in order of decreasing cost until all edges are covered
        path_list.sort(key=lambda trip: trip[0])
        while edges and path_list:
            path_trip = path_list.pop()
            edges = edges - path_trip[1]
            qs = path_trip[2]
            nquasispecies[qs] += path_trip[0]
            total += path_trip[0]

        # remove the quasispecies we didn't use from the path dict for EM
        for path_trip in path_list:
            # if factor < 1 there may be duplicates
            if path_trip[2] in qsps_path:
                del qsps_path[path_trip[2]]

        quasispecies = nquasispecies

    # normalize the bandwidths to be frequencies
    logging.debug("{0} found {1} variants".format(func_name,
                                                  len(quasispecies)))
    if use_em:
        final = frequency_em(graph, qsps_path)
    else:
        i = 0
        for qs in quasispecies:
            freq = quasispecies[qs]
            final[qs] = freq / total
            log_str = "Variant {0} has length {1} and frequency {2}"
            log_str = log_str.format(i, len(qs), final[qs])
            logging.debug(log_str)
            i += 1

    return final


def _general_fork_resolution(graph, resolver):
    """ This is our general algorithm to resolve forks. It will continually
    call the resolver function on each fork until no forks exist in the
    graph.

    Parameters
    ----------
    graph : DecliquedAmpliconReadGraph
        Amplicon read graph with fork vertices added.

    resolver : function
        Function that takes a fork's predecessor read/freq pairs and
        successor read/freq pairs. It should return the resolution in list
        form as (lread, rread, freq) tuples.

    Returns
    -------
    graph : DecliquedAmpliconReadGraph
        Modified read graph with all forks resolved.
    """
    def _get_read_pairs(graph, func, fork):
        """ Return a list of (freq, read) pairs depending on the
        function used """
        return [(graph.get_node_freq(rd), rd) for rd in func(fork)]

    while True:
        forks = graph.get_forks()
        if not forks:
            break

        for fork in forks:
            # get the predecessor and successor reads of the fork
            lreads = _get_read_pairs(graph, graph.predecessors, fork)
            rreads = _get_read_pairs(graph, graph.successors, fork)
            edgelist = resolver(lreads, rreads)
            for lread, rread, freq in edgelist:
                graph.split_edge(fork, lread, rread, freq)

            graph.remove_node(fork)
            reads = [r for f, r in lreads] + [r for f, r in rreads]
            graph.remove_nodes_from(reads)

    return graph


def _guide_distribution_method(freq_matrix, se_positions, use_chi_sqrd):
    """ Reconstruct quasispecies by repeatedly choosing a guide amplicon and
    extending its reads left and right with most-consistent reads. """
    total = 0.0
    quasispecies = {}
    variants = {}
    row, col = freq_matrix.shape
    read_count = row * col
    examined = {}
    # just short-circuit here if there are no columns to check.
    if col == 0:
        return {}

    while True:
        guide = _choose_guide(freq_matrix, use_chi_sqrd)
        # matching with other amplicons
        for gdx, gpair in enumerate(freq_matrix[guide]):
            gread, gfreq = gpair
            examined[(guide, gdx)] = True
            # we cannot use this read
            if gfreq == 0:
                continue

            start, end = se_positions[guide]
            if guide != 0:
                lstart, lend = se_positions[guide - 1]
                overlap = lend - start
            else:
                # doesn't matter, it won't be used
                overlap = 0

            # match to the "left" of the amplicon
            lreads = _find_reads(gread, gdx, reversed(freq_matrix[:guide]),
                                 overlap, examined, swap_compare=True)
            lreads.reverse()
            lreads.append((gread, gdx))

            if guide != len(freq_matrix) - 1:
                rstart, rend = se_positions[guide + 1]
                overlap = end - rstart
            else:
                # doesn't matter, it won't be used
                overlap = 0

            # match to the "right" of the amplicon
            rreads = _find_reads(gread, gdx, freq_matrix[guide + 1:],
                                 overlap, examined)
            lreads.extend(rreads)

            # we should have a read from each amplicon
            if len(lreads) == len(freq_matrix):
                # this unzips the list and gives us
                # the list of reads and list of indices
                reads, idxs = zip(*lreads)
                # create the variant, add it to the list,
                # and update the frequency matrix
                qs = bioa.reconstruct_sequence(reads, se_positions)
                quasispecies[qs] = gfreq
                _update_freq_matrix(freq_matrix, idxs, gfreq)
                total += gfreq

        # if a column is exhausted we've run out of possibilities
        # and should finish, or if all items have been examined
        # we should finish.
        if _has_empty_column(freq_matrix) or len(examined) == read_count:
            break

    for qs, freq in quasispecies.items():
        variants[qs] = freq / total

    return variants


def _has_empty_column(freq_matrix):
    """ Determines if the frequency matrix has an exhausted column, that is,
    a column containing a read with zero frequency.

    Parameters
    ----------
    freq_matrix : numpy matrix
        The frequency matrix over the reads.

    Returns
    -------
    contains_empty_column : bool
        True if the matrix contains an exhausted column. False otherwise.
    """
    exhausted = False
    for col in freq_matrix:
        if not all(f for r, f in col):
            exhausted = True
            break

    return exhausted


def _update_freq_matrix(freq_matrix, indices, freq):
    """ This updates the frequency matrix by subtracting the supplied
    frequency from the found reads.

    Parameters
    ----------
    freq_matrix : bioa.FrequencyMatrix
        The frequency matrix over the reads.

    indices : list
        List of indices where the chosen reads were in the original matrix.

    freq : float
        The frequency to deduct from other reads.

    Returns
    -------
    None
    """
    for cdx, rdx in enumerate(indices):
        read, val = freq_matrix[cdx][rdx]
        val = max([0, val - freq])
        freq_matrix[cdx][rdx] = (read, val)

    return


def _find_reads(seed, seedx, columns, overlap, examined, swap_compare=False):
    """ Finds all reads over the columns given the seed.

    Parameters
    ----------
    seed : str
        The read to begin matching consistent reads with.

    seedx : int
        The index of the initial seed.

    columns : iterable
        List of columns to iterate over, searching for reads.

    overlap : int
        The amount that any neighboring reads overlap by.

    examined : dict
        Boolean dictionary to indicate that we have examined a given read.

    swap_compare : bool (optional, default=False)
        Whether or not to swap the order of the comparing reads for
        consistency.

    Returns
    -------
    readidxpairs : list
        A list of pairs. The first element of each tuple is the read that
        is consistent with its neighbor. The second item is the index of
        the found read. An empty list is returned if none are found.
""" readidxpairs = [] tmpread = seed tmpsdx = seedx for cdx, col in enumerate(columns): read, idx = _find_consistent_read(tmpread, tmpsdx, cdx, col, \ overlap, examined, swap_compare) if read != "": readidxpairs.append((read, idx)) tmpread = read tmpsdx = idx return readidxpairs def _find_consistent_read(seed, seedx, colidx, column, overlap, \ examined, swap_compare): """ Finds a consistent read given the seed and other columns to search. Parameters ---------- seed : str The read to match a consistent read with. seedx : int The index of the seed. colidx : int The index of the current column in the original frequency matrix. column : list The current column to iterate over to check for a consistent read. overlap : int The amount that any neighboring reads overlap by. examined : dict Boolean dictionary to indicate that we have examined a given read. swap_compare : bool Whether or not to swap the order of the comparing reads for consistency. Returns ------- read_and_position : (str, int) The consistent read _in the current_ column, and its index. Empty string and -1 if not found. """ read = "" mdx = sys.maxint tmpmdx = mdx ridx = -1 consistent = False for idx, pair in enumerate(column): pread, pfreq = pair examined[(colidx, idx)] = True # if its an empty read, no bother checking for consistency if pread == "": continue # if we are going to the left in the matrix, we need # to swap the order in checking consistency if swap_compare: consistent = bioa.are_consistent(pread, seed, overlap) else: consistent = bioa.are_consistent(seed, pread, overlap) tmpmdx = abs(seedx - idx) if consistent and pfreq != 0 and tmpmdx < mdx: read = pread ridx = idx mdx = tmpmdx return (read, ridx) def _choose_guide(freq_matrix, use_chi_sqrd=False): """ Selects a guide amplicon from the read matrix. This can be selected by the uniform distribution or by maximum likelihood. Parameters: freq_matrix : bioa.FrequencyMatrix Numpy matrix where columns correspond to the absolute frequency (i.e. counts) distributions of distinct reads in the amplicons and rows correspond to distinct read-representatives with their associated frequencies. use_chi_sqrd : bool Whether to sample using the chi squared distribution or uniform distribution. Returns ------- index : int A guide index. """ def _pcs(ex, obs): return ((ex - obs) ** 2.0) / ex if ex != 0.0 else 0.0 val = 0 if use_chi_sqrd: # perform pearson chi-squared tests for all pairs use the amplicon # that has the least sum of all its chi-square tests final = numpy.zeros(len(freq_matrix)) for idx, amp in enumerate(freq_matrix): chis = [] reads, freqs = zip(*amp) for namp in freq_matrix: nreads, nfreqs = zip(*namp) pairs = zip(freqs, nfreqs) # $\chi^2 = \sum_i^n \frac{(O_i - E_i)^2}{E_i}$ chisqr = sum([_pcs(pair[0], pair[1]) for pair in pairs]) chis.append(chisqr) final[idx] = sum(chis) val = final.argmin() else: # just use uniform distribution to choose val = numpy.random.randint(0, len(freq_matrix)) return val