Source code for bioa.util.validation

"""
========================================
Validation for reconstruction algorithms
========================================
"""

__all__ = ["hamming_distance",
           "mean_diversity",
           "shannon_entropy",
           "jensen_shannon_divergence",
           "kullback_leibler_divergence",
           "root_mean_square_deviation",
           "f_score",
           "sensitivity",
           "positive_predictive_value"]

__author__ = """Nicholas Mancuso (nick.mancuso@gmail.com)"""
#   Copyright (C) 2011-2012 by
#   Nicholas Mancuso <nick.mancuso@gmail.com>
#   Bassam Torq <basamt@gmail.com>
#   All rights reserved.
#   BSD license.

import math
import time
import itertools as itools

import bioa
import metric


[docs]def hamming_distance(str1, str2): """ Calculate the Hamming distance between two strings Parameters ---------- str1 : str First string to use in comparison str2 : str Second string to use in comparison Returns ------- distance : int The Hamming distance between strings. In other words, the number of character inequalities. """ if len(str1) != len(str2): raise ValueError("Strings must be equal in length!") return metric.hamming(str1, str2, len(str1))
[docs]def mean_diversity(amp1, amp2, overlap, sample_size): """ Estimate/Compute the mean overlap diversity. Parameters ---------- amp1 : bioa.Amplicons First amplicon set to use in estimation. amp2 : bioa.Amplicons Second amplicon set to use in estimation. overlap : int The amount that reads from amp1 overlap with reads in amp2. sample_size : int The amount to use in the sample for estimation. Returns ------- mean_diversity : float Mean overlap diversity value from the smaple. """ if not amp1 or not amp2: raise ValueError("Amplicons must not be empty!") seed = int(time.time()) return metric.diversity(amp1, amp2, overlap, sample_size, seed)
[docs]def shannon_entropy(prob_mass_map): """ Calculate the entropy of the given probability mass function. Parameters ---------- prob_mass_map : dictionary A dictionary mapping items to probabilities. Returns ------- entropy : float The shannon entropy given the PMF. """ return -sum([freq * math.log(freq, 2) for freq in \ prob_mass_map.values() if freq != 0])
[docs]def jensen_shannon_divergence(real, predicted): """ Quasi-metric for computing relative entropy with a midpoint. It is for computing Kullback-Leibler Divergence when the sets for comparison might not contain the same elements. Parameters ---------- real : dict A dictionary mapping quasispecies to frequencies. predicted : dict the calculated dictionary mapping quasispecies to frequencies. Returns ------- divergence : float The "distance" between the two distributions. Formally, the quality of the approximation of one distribution by the other distribution. """ midpoint = dict(real) for qs, freq in predicted.items(): if qs in midpoint: midpoint[qs] += freq else: midpoint[qs] = freq for qs, freq in midpoint.items(): midpoint[qs] = freq / 2.0 return 0.5 * (kullback_leibler_divergence(real, midpoint) + \ kullback_leibler_divergence(predicted, midpoint))
[docs]def kullback_leibler_divergence(real, predicted): """ Relative entropy between the real frequencies and the predicted frequencies. Parameters ---------- real : dict A dictionary mapping quasispecies to frequencies. predicted : dict the calculated dictionary mapping quasispecies to frequencies. Returns ------- divergence : float The "distance" between the two distributions. Formally, the quality of the approximation of one distribution by the other distribution. """ sum = 0.0 for qs, freq_r in real.items(): freq = predicted.get(qs, float("nan")) operand = math.log(freq_r / freq, 2.0) if freq_r != 0 and freq != 0 else 0 sum += freq_r * operand return sum
[docs]def root_mean_square_deviation(real, predicted): """ Root mean square deviation. If the sets do not completely overlap then RMSD is calculated over the intersection. Parameters ---------- real : dict A dictionary mapping quasispecies to frequencies. predicted : dict the calculated dictionary mapping quasispecies to frequencies. Returns ------- rmsd : float The root mean square deviation """ def _lookup(qs): rval = real[qs] if qs in real else 0.0 pval = predicted[qs] if qs in predicted else 0.0 return (rval - pval)**2.0 inter = set(real.keys()) & set(predicted.keys()) if len(inter) == 0: return float("inf") mean = sum(itools.imap(_lookup, inter)) / float(len(inter)) return math.sqrt(mean)
def f_score(real, predicted): """ F-Score, or F1-Score is defined as: `2 * PPV * Senstivity / (PPV + Sensitivity)` It can be thought of as a weighted average of PPV and Senstivity (Precision and Recall). Parameters ---------- real : dict A dictionary mapping quasispecies to frequencies. predicted : dict The calculated dictionary mapping quasispecies to frequencies. Returns ------- f_score : float Weighted average of PPV and Senstivity (Precision and Recall). """ ppv = bioa.positive_predicted_value(real, predicted) sen = bioa.sensitivity(real, predicted) _sum = float(ppv + sen) return (2.0 * ppv * sen) / _sum if _sum else 0.0
[docs]def sensitivity(real, predicted): """ Sensitivity = `|True Positives| / (|True Positives| + |False Negatives|)` Parameters ---------- real : dict A dictionary mapping quasispecies to frequencies. predicted : dict The calculated dictionary mapping quasispecies to frequencies. Returns ------- sensitivity : float The sensitivity. """ truepos = set(predicted.keys()) & set(real.keys()) return len(truepos) / float(len(real))
[docs]def positive_predictive_value(real, predicted): """ PPV = `|True Positives| / (|True Positives| + |False Positives|)` Parameters ---------- real : dict A dictionary mapping quasispecies to frequencies. predicted : dict The calculated dictionary mapping quasispecies to frequencies. Returns ------- PPV : float The positive predicted value """ truepos = set(predicted.keys()) & set(real.keys()) return len(truepos) / float(len(predicted))