Source code for bioa.io.sam

# -*- coding: utf-8 -*-
"""
================================
Grab read data from SAM/BAM file
================================
"""

__all__ = ["reads_from_sam",
           "reads_from_sam_quick"]

__author__ = "\n".join(["Nicholas Mancuso (nick.mancuso@gmail.com)",
                       "Alex Artyomenko (artyomenkoav@gmail.com)"])
#   Copyright (C) 2011-2012 by
#   Nicholas Mancuso <nick.mancuso@gmail.com>
#   Alex Artyomenko <artyomenkoav@gmail.com>
#   All rights reserved.
#   BSD license.

import sys
from collections import defaultdict

MATCH = 0
INSERTION = 1
DELETION = 2
CLIPPING = 4

INSERT_CHAR = "-"
DELETE_CHAR = "-"


[docs]def reads_from_sam_quick(sam_file):
    """ Quickly parse aligned reads from SAM/BAM file.
        This only returns reads and does not attempt
        to pad.

        Parameters
        ----------
        sam_file : pysam.SamFile
            The SAM/BAM file object.

        Returns
        -------
        read_pos_iter : (str, int) iterable
            Iterable of read-pairs indicating the read
            and its starting position in the reference.
    """
    for aligned_read in sam_file:
        if aligned_read.is_unmapped:
            continue

        yield (str(aligned_read.seq), aligned_read.pos)

    return


[docs]def reads_from_sam(sam_file, reference):
    """ Parse aligned reads from SAM/BAM file.

        Parameters
        ----------
        sam_file : pysam.SamFile
            The SAM/BAM file object.

        reference : Bio.Seq
            The reference sequence used for alignment.

        Returns
        -------
        align_start, align_stop, reads, ref : (int, int, list, Bio.Seq)
            The alignment start, alignment stop, list of reads (str, int), and
            extended reference. Each (str, int) pair represents the read and its
            starting position.
    """
    from Bio.Seq import Seq

    align_start = sys.maxint
    ref_inserts = defaultdict(int)
    for aligned_read in sam_file:
        # skip unmapped reads
        if aligned_read.is_unmapped:
            continue

        insertions = _calculate_insertions(aligned_read)
        for pos in insertions:
            ref_inserts[pos] = max(ref_inserts[pos], insertions[pos])

    # extend the reference
    new_ref = []
    for idx, nucleotide in enumerate(reference):
        new_ref.append(nucleotide)
        if idx in ref_inserts:
            new_ref.append(INSERT_CHAR * ref_inserts[idx])

    sam_file.reset()
    cache = defaultdict(int)
    align_start, align_stop, reads = _adjust_all_reads(sam_file, ref_inserts, cache)
    sam_file.close()

    return align_start, align_stop, reads, Seq("".join(new_ref))


def _adjust_all_reads(sam_file, ref_inserts, cache):
    """ Count the offset to give aligned start position.
        Adjust read and output it to ira format file.
    """
    reads = []
    align_start, align_stop = sys.maxint, 0
    for aligned_read in sam_file:
        # skip unmapped reads
        if aligned_read.is_unmapped:
            continue

        offset = _cached_pos_offset(cache, ref_inserts, aligned_read.pos)
        adj_read = _adjust_read(aligned_read, ref_inserts)
        start = aligned_read.pos + offset + 1
        align_start = min(align_start, start)
        align_stop = max(align_stop, start + len(adj_read))
        reads.append((adj_read, start))

    return align_start, align_stop, reads


def _calculate_insertions(aligned_read):
    """ Calculate the insertion positions to add to the extended reference.

        Parameters
        ----------
        aligned_read : pysam.AlignedRead
            The aligned read from a sam/bam file

        Returns
        -------
        insertions : Iterable of insertion positions
    """
    offset = 0
    ins = defaultdict(int)
    cigar = aligned_read.cigar
    pos = aligned_read.pos
    for op, count in cigar:
        if op == INSERTION:
            # output insertion positions
            ins[pos + offset - 1] = max(ins[pos + offset - 1], count)
        elif op < 2:
            # need to offset for both MATCH and INSERTION
            offset += count
    return ins


def _retrieve_insertion(new_read):
    """ Before going to insertion on cigar last insertion
        should be retrieved.
    """
    # ? What is the point of this, to remove last insertions?
    tm = new_read.pop()
    if len(tm) > 0 and tm[0] != INSERT_CHAR:
        new_read.append(tm)


def _adjust_read(aligned_read, inserts):
    """ Adjust the read to have extended characters represented by
        deleted positions.

        Parameters
        ----------
        aligned_read : pysam.AlignedRead
            The aligned read to add deletion characters to.

        inserts : dict
            Dictionary of indicies indicating an insertion in the original
            reference.

        Returns
        -------
        extended_read : Bio.Seq
            Extended sequence
    """
    # offset on reference and read
    g_offset, l_offset = 0, 0
    read = []
    for op, count in aligned_read.cigar:
        pos = aligned_read.pos
        if op == DELETION or op == MATCH:
            # append the number of deletion chars
            # append the bases, but also check if position
            # is an insertion char in the extended ref
            for i in range(count):
                char = DELETE_CHAR if op == DELETION \
                        else aligned_read.seq[i + l_offset]
                read.append(char)
                cur_pos = i + g_offset + pos
                if cur_pos in inserts and inserts[cur_pos]:
                    read.append(INSERT_CHAR * inserts[cur_pos])

            g_offset += count

        elif op == INSERTION:
            # append the insertion bases
            _retrieve_insertion(read)
            read.append(aligned_read.seq[l_offset:l_offset + count])
            ins = inserts[g_offset + pos - 1]
            if count < ins:
                read.append(INSERT_CHAR * (ins - count))

        if op == MATCH or op == INSERTION or op == CLIPPING:
            l_offset += count
        elif op == DELETION:
            l_offset -= count

    return "".join(read)


def _cached_pos_offset(cache, ref_inserts, pos):
    if pos - 1 in cache:
        return cache[pos - 1]

    val = 0
    for i in range(pos):
        val += ref_inserts[i]
        cache[i] = val

    return cache[pos - 1]
Navigation

Quick search

Source code for bioa.io.sam

Navigation