Source code for bioa.io.amplicon

# -*- coding: utf-8 -*-
"""
==========================================
Input/Output algorithms for bioa.Amplicons
==========================================
"""

__all__ = ["find_amplicons_by_primers",
           "parse_start_end_positions",
           "build_amplicons_from_file",
           "build_uniform_amplicons_from_file",
           "write_amplicons_to_fasta"]

__author__ = """Nicholas Mancuso (nick.mancuso@gmail.com)"""
# Copyright (C) 2011-2013 by
# Nicholas Mancuso <nick.mancuso@gmail.com>
# All rights reserved.
# BSD license.

import csv
import bioa as b
from Bio import SeqIO


[docs]def find_amplicons_by_primers(reference, primers):
    """ Finds the amplicon starting and ending positions
    in the reference given the primers used for the experiments.

    Parameters
    ----------
    reference : Bio.Seq
        The reference sequence used to align reads with.

    primers : list of Bio.SeqRecord
        List containing starting and ending primers for all
        experiments. Primers should be ordered by the amplicon order
        along the reference. Additionally each pair should be ordered
        by starting primer then ending primer.

    Returns
    -------
    se_positions : list of (int, int)
        List of integer pairs indicating the starting and ending positions
        for each amplicon in the reference.
    """
    if not reference:
        raise ValueError("Must provide a reference to find amplicons!")
    if not primers:
        raise ValueError("Must provide a primer list!")

    if len(primers) % 2 != 0:
        raise ValueError("Must be starting and ending primers for each amplicon!")

    # alignment scores
    MATCH = 2
    MISMATCH = -1
    OPEN_GAP = -0.5
    EXTEND_GAP = -0.1
    from Bio.pairwise2 import align
    # wrapper function to find best local alignment
    def _best(reference, seq):
        return max(align.localms(reference, seq, MATCH, MISMATCH,
                    OPEN_GAP, EXTEND_GAP))

    se_positions = []
    for idx in range(0, len(primers) - 1, 2):
        record = primers[idx]
        nextrecord = primers[idx + 1]
        # do a local alignment of the primers to the reference.
        s1, s2, score, begin, _ = _best(reference, record.seq)

        # reverse complement for the backward primer
        reverse = nextrecord.seq.reverse_complement()
        s1, s2, score, _, end = _best(reference, reverse)

        # account for 1-based positions
        se_positions.append((begin + 1, end + 1))

    return se_positions


def parse_start_end_positions(stream):
    """ Parse a stream that contains start and end positions
    for the defined amplicons. One pair per line, e.g.,
        1, 250
        183, 400
    would be the starting and ending positions for two amplicons.

    Parameters
    ----------
    stream : stream type
        The stream to parse the pairs from

    Returns
    -------
    pairs : iterable of (int, int)
        The starting and ending positions.
    """
    reader = csv.reader(stream)
    for row in reader:
        if len(row) == 2:
            yield (int(row[0]), int(row[1]))
    return


[docs]def build_amplicons_from_file(filename, se_positions, ref_len):
    """ Creates the Amplicons data structure from a file of reads.
    The file structure must follow:
        read, position
    for this to operate correctly.

    Parameters
    ----------
    filename : str
        Path to the read-data file.

    se_positions : list of (int, int)
        List of starting and ending positions for each amplicon.

    ref_len : int
        Length of the entire reference genome.

    Returns
    -------
    amplicons : bioa.Amplicons
        Amplicon data structure containing all distinct reads and meta-data.
    """

    amplicons = []
    with open(filename, "r") as data_file:
        amplicons = b.Amplicons(data_file, se_positions, ref_len)

    return amplicons


[docs]def build_uniform_amplicons_from_file(filename, amp_len, overlap, ref_len):
    """ Creates the Amplicons data structure from a file of reads.
    The file structure must follow:
            read, position
    for this to operate correctly. This assumes that all neigboring
    amplicons have the same overlap.

    Parameters
    ----------
    filename : str
        Path to the read-data file.

    amp_len : int
        Number of bases covered by a single amplicon.

    overlap : int
        The number of bases that neighboring amplicons overlap.

    ref_len : int
        Length of the entire reference genome.

    Returns
    -------
    amplicons : bioa.Amplicons
        Amplicon data structure containing all distinct reads and meta-data.
    """

    amplicons = []
    with open(filename, "r") as data_file:
        amplicons = b.Amplicons.build_uniform_amplicons(data_file, amp_len, \
                        overlap, ref_len)

    return amplicons


[docs]def write_amplicons_to_fasta(filename, amplicons):
    """ Write amplicon sequences to file.

    Parameters
    ----------
    filename : str
        Filename to write amplicon sequence data to.

    amplicons : bioa.Amplicons
        Amplicon data structure containing all distinct reads and meta-data.

    Returns
    -------
    None
    """

    with open(filename, "w") as fasta_file:
        for amplicon in amplicons:
            SeqIO.write(amplicon, fasta_file, "fasta")
    return
Navigation

Quick search

Source code for bioa.io.amplicon

Navigation